40 8 41 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright 2003 Andi Kleen, SuSE Labs. * * Modified for x86 32 bit architecture by * Stefani Seibold <stefani@seibold.net> * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany * * Thanks to hpa@transmeta.com for some useful hint. * Special thanks to Ingo Molnar for his early experience with * a different vsyscall implementation for Linux/IA32 and for the name. * */ #include <linux/timekeeper_internal.h> #include <asm/vgtod.h> #include <asm/vvar.h> int vclocks_used __read_mostly; DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); void update_vsyscall_tz(void) { vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; } void update_vsyscall(struct timekeeper *tk) { int vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; /* Mark the new vclock used. */ BUILD_BUG_ON(VCLOCK_MAX >= 32); WRITE_ONCE(vclocks_used, READ_ONCE(vclocks_used) | (1 << vclock_mode)); gtod_write_begin(vdata); /* copy vsyscall data */ vdata->vclock_mode = vclock_mode; vdata->cycle_last = tk->tkr_mono.cycle_last; vdata->mask = tk->tkr_mono.mask; vdata->mult = tk->tkr_mono.mult; vdata->shift = tk->tkr_mono.shift; vdata->wall_time_sec = tk->xtime_sec; vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; vdata->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + ((u64)tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); while (vdata->monotonic_time_snsec >= (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { vdata->monotonic_time_snsec -= ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; vdata->monotonic_time_sec++; } vdata->wall_time_coarse_sec = tk->xtime_sec; vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); vdata->monotonic_time_coarse_sec = vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; vdata->monotonic_time_coarse_nsec = vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; vdata->monotonic_time_coarse_sec++; } gtod_write_end(vdata); }
2959 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 /* * include/net/dsa.h - Driver for Distributed Switch Architecture switch chips * Copyright (c) 2008-2009 Marvell Semiconductor * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ #ifndef __LINUX_NET_DSA_H #define __LINUX_NET_DSA_H #include <linux/if.h> #include <linux/if_ether.h> #include <linux/list.h> #include <linux/notifier.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/of.h> #include <linux/ethtool.h> #include <net/devlink.h> #include <net/switchdev.h> struct tc_action; struct phy_device; struct fixed_phy_status; enum dsa_tag_protocol { DSA_TAG_PROTO_NONE = 0, DSA_TAG_PROTO_BRCM, DSA_TAG_PROTO_DSA, DSA_TAG_PROTO_EDSA, DSA_TAG_PROTO_KSZ, DSA_TAG_PROTO_LAN9303, DSA_TAG_PROTO_MTK, DSA_TAG_PROTO_QCA, DSA_TAG_PROTO_TRAILER, DSA_TAG_LAST, /* MUST BE LAST */ }; #define DSA_MAX_SWITCHES 4 #define DSA_MAX_PORTS 12 #define DSA_RTABLE_NONE -1 struct dsa_chip_data { /* * How to access the switch configuration registers. */ struct device *host_dev; int sw_addr; /* * Reference to network devices */ struct device *netdev[DSA_MAX_PORTS]; /* set to size of eeprom if supported by the switch */ int eeprom_len; /* Device tree node pointer for this specific switch chip * used during switch setup in case additional properties * and resources needs to be used */ struct device_node *of_node; /* * The names of the switch's ports. Use "cpu" to * designate the switch port that the cpu is connected to, * "dsa" to indicate that this port is a DSA link to * another switch, NULL to indicate the port is unused, * or any other string to indicate this is a physical port. */ char *port_names[DSA_MAX_PORTS]; struct device_node *port_dn[DSA_MAX_PORTS]; /* * An array of which element [a] indicates which port on this * switch should be used to send packets to that are destined * for switch a. Can be NULL if there is only one switch chip. */ s8 rtable[DSA_MAX_SWITCHES]; }; struct dsa_platform_data { /* * Reference to a Linux network interface that connects * to the root switch chip of the tree. */ struct device *netdev; struct net_device *of_netdev; /* * Info structs describing each of the switch chips * connected via this network interface. */ int nr_chips; struct dsa_chip_data *chip; }; struct packet_type; struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); int (*flow_dissect)(const struct sk_buff *skb, __be16 *proto, int *offset); }; struct dsa_switch_tree { struct list_head list; /* Notifier chain for switch-wide events */ struct raw_notifier_head nh; /* Tree identifier */ u32 tree; /* Number of switches attached to this tree */ struct kref refcount; /* Has this tree been applied to the hardware? */ bool applied; /* * Configuration data for the platform device that owns * this dsa switch tree instance. */ struct dsa_platform_data *pd; /* Copy of tag_ops->rcv for faster access in hot path */ struct sk_buff * (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); /* * The switch port to which the CPU is attached. */ struct dsa_port *cpu_dp; /* * Data for the individual switch chips. */ struct dsa_switch *ds[DSA_MAX_SWITCHES]; /* * Tagging protocol operations for adding and removing an * encapsulation tag. */ const struct dsa_device_ops *tag_ops; }; /* TC matchall action types, only mirroring for now */ enum dsa_port_mall_action_type { DSA_PORT_MALL_MIRROR, }; /* TC mirroring entry */ struct dsa_mall_mirror_tc_entry { u8 to_local_port; bool ingress; }; /* TC matchall entry */ struct dsa_mall_tc_entry { struct list_head list; unsigned long cookie; enum dsa_port_mall_action_type type; union { struct dsa_mall_mirror_tc_entry mirror; }; }; struct dsa_port { struct dsa_switch *ds; unsigned int index; const char *name; struct dsa_port *cpu_dp; struct net_device *netdev; struct device_node *dn; unsigned int ageing_time; u8 stp_state; struct net_device *bridge_dev; struct devlink_port devlink_port; /* * Original copy of the master netdev ethtool_ops */ struct ethtool_ops ethtool_ops; const struct ethtool_ops *orig_ethtool_ops; }; struct dsa_switch { struct device *dev; /* * Parent switch tree, and switch index. */ struct dsa_switch_tree *dst; int index; /* Listener for switch fabric events */ struct notifier_block nb; /* * Give the switch driver somewhere to hang its private data * structure. */ void *priv; /* * Configuration data for this switch. */ struct dsa_chip_data *cd; /* * The switch operations. */ const struct dsa_switch_ops *ops; /* * An array of which element [a] indicates which port on this * switch should be used to send packets to that are destined * for switch a. Can be NULL if there is only one switch chip. */ s8 rtable[DSA_MAX_SWITCHES]; /* * Slave mii_bus and devices for the individual ports. */ u32 dsa_port_mask; u32 cpu_port_mask; u32 enabled_port_mask; u32 phys_mii_mask; struct mii_bus *slave_mii_bus; /* Ageing Time limits in msecs */ unsigned int ageing_time_min; unsigned int ageing_time_max; /* devlink used to represent this switch device */ struct devlink *devlink; /* Number of switch port queues */ unsigned int num_tx_queues; /* Dynamically allocated ports, keep last */ size_t num_ports; struct dsa_port ports[]; }; static inline bool dsa_is_cpu_port(struct dsa_switch *ds, int p) { return !!(ds->cpu_port_mask & (1 << p)); } static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) { return !!((ds->dsa_port_mask) & (1 << p)); } static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) { return !dsa_is_cpu_port(ds, p) && !dsa_is_dsa_port(ds, p); } static inline u8 dsa_upstream_port(struct dsa_switch *ds) { struct dsa_switch_tree *dst = ds->dst; /* * If this is the root switch (i.e. the switch that connects * to the CPU), return the cpu port number on this switch. * Else return the (DSA) port number that connects to the * switch that is one hop closer to the cpu. */ if (dst->cpu_dp->ds == ds) return dst->cpu_dp->index; else return ds->rtable[dst->cpu_dp->ds->index]; } typedef int dsa_fdb_dump_cb_t(const unsigned char *addr, u16 vid, bool is_static, void *data); struct dsa_switch_ops { /* * Legacy probing. */ const char *(*probe)(struct device *dsa_dev, struct device *host_dev, int sw_addr, void **priv); enum dsa_tag_protocol (*get_tag_protocol)(struct dsa_switch *ds); int (*setup)(struct dsa_switch *ds); int (*set_addr)(struct dsa_switch *ds, u8 *addr); u32 (*get_phy_flags)(struct dsa_switch *ds, int port); /* * Access to the switch's PHY registers. */ int (*phy_read)(struct dsa_switch *ds, int port, int regnum); int (*phy_write)(struct dsa_switch *ds, int port, int regnum, u16 val); /* * Link state adjustment (called from libphy) */ void (*adjust_link)(struct dsa_switch *ds, int port, struct phy_device *phydev); void (*fixed_link_update)(struct dsa_switch *ds, int port, struct fixed_phy_status *st); /* * ethtool hardware statistics. */ void (*get_strings)(struct dsa_switch *ds, int port, uint8_t *data); void (*get_ethtool_stats)(struct dsa_switch *ds, int port, uint64_t *data); int (*get_sset_count)(struct dsa_switch *ds); /* * ethtool Wake-on-LAN */ void (*get_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); int (*set_wol)(struct dsa_switch *ds, int port, struct ethtool_wolinfo *w); /* * Suspend and resume */ int (*suspend)(struct dsa_switch *ds); int (*resume)(struct dsa_switch *ds); /* * Port enable/disable */ int (*port_enable)(struct dsa_switch *ds, int port, struct phy_device *phy); void (*port_disable)(struct dsa_switch *ds, int port, struct phy_device *phy); /* * Port's MAC EEE settings */ int (*set_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_eee *e); int (*get_mac_eee)(struct dsa_switch *ds, int port, struct ethtool_eee *e); /* EEPROM access */ int (*get_eeprom_len)(struct dsa_switch *ds); int (*get_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); int (*set_eeprom)(struct dsa_switch *ds, struct ethtool_eeprom *eeprom, u8 *data); /* * Register access. */ int (*get_regs_len)(struct dsa_switch *ds, int port); void (*get_regs)(struct dsa_switch *ds, int port, struct ethtool_regs *regs, void *p); /* * Bridge integration */ int (*set_ageing_time)(struct dsa_switch *ds, unsigned int msecs); int (*port_bridge_join)(struct dsa_switch *ds, int port, struct net_device *bridge); void (*port_bridge_leave)(struct dsa_switch *ds, int port, struct net_device *bridge); void (*port_stp_state_set)(struct dsa_switch *ds, int port, u8 state); void (*port_fast_age)(struct dsa_switch *ds, int port); /* * VLAN support */ int (*port_vlan_filtering)(struct dsa_switch *ds, int port, bool vlan_filtering); int (*port_vlan_prepare)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans); void (*port_vlan_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct switchdev_trans *trans); int (*port_vlan_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan); /* * Forwarding database */ int (*port_fdb_add)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int (*port_fdb_del)(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid); int (*port_fdb_dump)(struct dsa_switch *ds, int port, dsa_fdb_dump_cb_t *cb, void *data); /* * Multicast database */ int (*port_mdb_prepare)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans); void (*port_mdb_add)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb, struct switchdev_trans *trans); int (*port_mdb_del)(struct dsa_switch *ds, int port, const struct switchdev_obj_port_mdb *mdb); /* * RXNFC */ int (*get_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc, u32 *rule_locs); int (*set_rxnfc)(struct dsa_switch *ds, int port, struct ethtool_rxnfc *nfc); /* * TC integration */ int (*port_mirror_add)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror, bool ingress); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); /* * Cross-chip operations */ int (*crosschip_bridge_join)(struct dsa_switch *ds, int sw_index, int port, struct net_device *br); void (*crosschip_bridge_leave)(struct dsa_switch *ds, int sw_index, int port, struct net_device *br); }; struct dsa_switch_driver { struct list_head list; const struct dsa_switch_ops *ops; }; /* Legacy driver registration */ void register_switch_driver(struct dsa_switch_driver *type); void unregister_switch_driver(struct dsa_switch_driver *type); struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); struct net_device *dsa_dev_to_net_device(struct device *dev); /* Keep inline for faster access in hot path */ static inline bool netdev_uses_dsa(struct net_device *dev) { #if IS_ENABLED(CONFIG_NET_DSA) return dev->dsa_ptr && dev->dsa_ptr->rcv; #endif return false; } struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n); void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); #else static inline int dsa_switch_suspend(struct dsa_switch *ds) { return 0; } static inline int dsa_switch_resume(struct dsa_switch *ds) { return 0; } #endif /* CONFIG_PM_SLEEP */ #endif
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 /* * linux/fs/hfsplus/part_tbl.c * * Copyright (C) 1996-1997 Paul H. Hargrove * This file may be distributed under the terms of * the GNU General Public License. * * Original code to handle the new style Mac partition table based on * a patch contributed by Holger Schemel (aeglos@valinor.owl.de). * * In function preconditions the term "valid" applied to a pointer to * a structure means that the pointer is non-NULL and the structure it * points to has all fields initialized to consistent values. * */ #include <linux/slab.h> #include "hfsplus_fs.h" /* offsets to various blocks */ #define HFS_DD_BLK 0 /* Driver Descriptor block */ #define HFS_PMAP_BLK 1 /* First block of partition map */ #define HFS_MDB_BLK 2 /* Block (w/i partition) of MDB */ /* magic numbers for various disk blocks */ #define HFS_DRVR_DESC_MAGIC 0x4552 /* "ER": driver descriptor map */ #define HFS_OLD_PMAP_MAGIC 0x5453 /* "TS": old-type partition map */ #define HFS_NEW_PMAP_MAGIC 0x504D /* "PM": new-type partition map */ #define HFS_SUPER_MAGIC 0x4244 /* "BD": HFS MDB (super block) */ #define HFS_MFS_SUPER_MAGIC 0xD2D7 /* MFS MDB (super block) */ /* * The new style Mac partition map * * For each partition on the media there is a physical block (512-byte * block) containing one of these structures. These blocks are * contiguous starting at block 1. */ struct new_pmap { __be16 pmSig; /* signature */ __be16 reSigPad; /* padding */ __be32 pmMapBlkCnt; /* partition blocks count */ __be32 pmPyPartStart; /* physical block start of partition */ __be32 pmPartBlkCnt; /* physical block count of partition */ u8 pmPartName[32]; /* (null terminated?) string giving the name of this partition */ u8 pmPartType[32]; /* (null terminated?) string giving the type of this partition */ /* a bunch more stuff we don't need */ } __packed; /* * The old style Mac partition map * * The partition map consists for a 2-byte signature followed by an * array of these structures. The map is terminated with an all-zero * one of these. */ struct old_pmap { __be16 pdSig; /* Signature bytes */ struct old_pmap_entry { __be32 pdStart; __be32 pdSize; __be32 pdFSID; } pdEntry[42]; } __packed; static int hfs_parse_old_pmap(struct super_block *sb, struct old_pmap *pm, sector_t *part_start, sector_t *part_size) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); int i; for (i = 0; i < 42; i++) { struct old_pmap_entry *p = &pm->pdEntry[i]; if (p->pdStart && p->pdSize && p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && (sbi->part < 0 || sbi->part == i)) { *part_start += be32_to_cpu(p->pdStart); *part_size = be32_to_cpu(p->pdSize); return 0; } } return -ENOENT; } static int hfs_parse_new_pmap(struct super_block *sb, void *buf, struct new_pmap *pm, sector_t *part_start, sector_t *part_size) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); int size = be32_to_cpu(pm->pmMapBlkCnt); int buf_size = hfsplus_min_io_size(sb); int res; int i = 0; do { if (!memcmp(pm->pmPartType, "Apple_HFS", 9) && (sbi->part < 0 || sbi->part == i)) { *part_start += be32_to_cpu(pm->pmPyPartStart); *part_size = be32_to_cpu(pm->pmPartBlkCnt); return 0; } if (++i >= size) return -ENOENT; pm = (struct new_pmap *)((u8 *)pm + HFSPLUS_SECTOR_SIZE); if ((u8 *)pm - (u8 *)buf >= buf_size) { res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK + i, buf, (void **)&pm, REQ_OP_READ, 0); if (res) return res; } } while (pm->pmSig == cpu_to_be16(HFS_NEW_PMAP_MAGIC)); return -ENOENT; } /* * Parse the partition map looking for the start and length of a * HFS/HFS+ partition. */ int hfs_part_find(struct super_block *sb, sector_t *part_start, sector_t *part_size) { void *buf, *data; int res; buf = kmalloc(hfsplus_min_io_size(sb), GFP_KERNEL); if (!buf) return -ENOMEM; res = hfsplus_submit_bio(sb, *part_start + HFS_PMAP_BLK, buf, &data, REQ_OP_READ, 0); if (res) goto out; switch (be16_to_cpu(*((__be16 *)data))) { case HFS_OLD_PMAP_MAGIC: res = hfs_parse_old_pmap(sb, data, part_start, part_size); break; case HFS_NEW_PMAP_MAGIC: res = hfs_parse_new_pmap(sb, buf, data, part_start, part_size); break; default: res = -ENOENT; break; } out: kfree(buf); return res; }
3 2 1 1 1 1 2 3 3 3 3 3 3 2 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 /* * Copyright (c) 2000-2001 Christoph Hellwig. * Copyright (c) 2016 Krzysztof Blaszkowski * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions, and the following disclaimer, * without modification. * 2. The name of the author may not be used to endorse or promote products * derived from this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL"). * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * Veritas filesystem driver - superblock related routines. */ #include <linux/init.h> #include <linux/module.h> #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/stat.h> #include <linux/vfs.h> #include <linux/mount.h> #include "vxfs.h" #include "vxfs_extern.h" #include "vxfs_dir.h" #include "vxfs_inode.h" MODULE_AUTHOR("Christoph Hellwig, Krzysztof Blaszkowski"); MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver"); MODULE_LICENSE("Dual BSD/GPL"); static struct kmem_cache *vxfs_inode_cachep; /** * vxfs_put_super - free superblock resources * @sbp: VFS superblock. * * Description: * vxfs_put_super frees all resources allocated for @sbp * after the last instance of the filesystem is unmounted. */ static void vxfs_put_super(struct super_block *sbp) { struct vxfs_sb_info *infp = VXFS_SBI(sbp); iput(infp->vsi_fship); iput(infp->vsi_ilist); iput(infp->vsi_stilist); brelse(infp->vsi_bp); kfree(infp); } /** * vxfs_statfs - get filesystem information * @dentry: VFS dentry to locate superblock * @bufp: output buffer * * Description: * vxfs_statfs fills the statfs buffer @bufp with information * about the filesystem described by @dentry. * * Returns: * Zero. * * Locking: * No locks held. * * Notes: * This is everything but complete... */ static int vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) { struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb); struct vxfs_sb *raw_sb = infp->vsi_raw; bufp->f_type = VXFS_SUPER_MAGIC; bufp->f_bsize = dentry->d_sb->s_blocksize; bufp->f_blocks = fs32_to_cpu(infp, raw_sb->vs_dsize); bufp->f_bfree = fs32_to_cpu(infp, raw_sb->vs_free); bufp->f_bavail = 0; bufp->f_files = 0; bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree); bufp->f_namelen = VXFS_NAMELEN; return 0; } static int vxfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); *flags |= MS_RDONLY; return 0; } static struct inode *vxfs_alloc_inode(struct super_block *sb) { struct vxfs_inode_info *vi; vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL); if (!vi) return NULL; inode_init_once(&vi->vfs_inode); return &vi->vfs_inode; } static void vxfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(vxfs_inode_cachep, VXFS_INO(inode)); } static void vxfs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, vxfs_i_callback); } static const struct super_operations vxfs_super_ops = { .alloc_inode = vxfs_alloc_inode, .destroy_inode = vxfs_destroy_inode, .evict_inode = vxfs_evict_inode, .put_super = vxfs_put_super, .statfs = vxfs_statfs, .remount_fs = vxfs_remount, }; static int vxfs_try_sb_magic(struct super_block *sbp, int silent, unsigned blk, __fs32 magic) { struct buffer_head *bp; struct vxfs_sb *rsbp; struct vxfs_sb_info *infp = VXFS_SBI(sbp); int rc = -ENOMEM; bp = sb_bread(sbp, blk); do { if (!bp || !buffer_mapped(bp)) { if (!silent) { printk(KERN_WARNING "vxfs: unable to read disk superblock at %u\n", blk); } break; } rc = -EINVAL; rsbp = (struct vxfs_sb *)bp->b_data; if (rsbp->vs_magic != magic) { if (!silent) printk(KERN_NOTICE "vxfs: WRONG superblock magic %08x at %u\n", rsbp->vs_magic, blk); break; } rc = 0; infp->vsi_raw = rsbp; infp->vsi_bp = bp; } while (0); if (rc) { infp->vsi_raw = NULL; infp->vsi_bp = NULL; brelse(bp); } return rc; } /** * vxfs_read_super - read superblock into memory and initialize filesystem * @sbp: VFS superblock (to fill) * @dp: fs private mount data * @silent: do not complain loudly when sth is wrong * * Description: * We are called on the first mount of a filesystem to read the * superblock into memory and do some basic setup. * * Returns: * The superblock on success, else %NULL. * * Locking: * We are under @sbp->s_lock. */ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) { struct vxfs_sb_info *infp; struct vxfs_sb *rsbp; u_long bsize; struct inode *root; int ret = -EINVAL; u32 j; sbp->s_flags |= MS_RDONLY; infp = kzalloc(sizeof(*infp), GFP_KERNEL); if (!infp) { printk(KERN_WARNING "vxfs: unable to allocate incore superblock\n"); return -ENOMEM; } bsize = sb_min_blocksize(sbp, BLOCK_SIZE); if (!bsize) { printk(KERN_WARNING "vxfs: unable to set blocksize\n"); goto out; } sbp->s_op = &vxfs_super_ops; sbp->s_fs_info = infp; if (!vxfs_try_sb_magic(sbp, silent, 1, (__force __fs32)cpu_to_le32(VXFS_SUPER_MAGIC))) { /* Unixware, x86 */ infp->byte_order = VXFS_BO_LE; } else if (!vxfs_try_sb_magic(sbp, silent, 8, (__force __fs32)cpu_to_be32(VXFS_SUPER_MAGIC))) { /* HP-UX, parisc */ infp->byte_order = VXFS_BO_BE; } else { if (!silent) printk(KERN_NOTICE "vxfs: can't find superblock.\n"); goto out; } rsbp = infp->vsi_raw; j = fs32_to_cpu(infp, rsbp->vs_version); if ((j < 2 || j > 4) && !silent) { printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", j); goto out; } #ifdef DIAGNOSTIC printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", j); printk(KERN_DEBUG "vxfs: blocksize: %d\n", fs32_to_cpu(infp, rsbp->vs_bsize)); #endif sbp->s_magic = fs32_to_cpu(infp, rsbp->vs_magic); infp->vsi_oltext = fs32_to_cpu(infp, rsbp->vs_oltext[0]); infp->vsi_oltsize = fs32_to_cpu(infp, rsbp->vs_oltsize); j = fs32_to_cpu(infp, rsbp->vs_bsize); if (!sb_set_blocksize(sbp, j)) { printk(KERN_WARNING "vxfs: unable to set final block size\n"); goto out; } if (vxfs_read_olt(sbp, bsize)) { printk(KERN_WARNING "vxfs: unable to read olt\n"); goto out; } if (vxfs_read_fshead(sbp)) { printk(KERN_WARNING "vxfs: unable to read fshead\n"); goto out; } root = vxfs_iget(sbp, VXFS_ROOT_INO); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } sbp->s_root = d_make_root(root); if (!sbp->s_root) { printk(KERN_WARNING "vxfs: unable to get root dentry.\n"); goto out_free_ilist; } return 0; out_free_ilist: iput(infp->vsi_fship); iput(infp->vsi_ilist); iput(infp->vsi_stilist); out: brelse(infp->vsi_bp); kfree(infp); return ret; } /* * The usual module blurb. */ static struct dentry *vxfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, vxfs_fill_super); } static struct file_system_type vxfs_fs_type = { .owner = THIS_MODULE, .name = "vxfs", .mount = vxfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("vxfs"); /* makes mount -t vxfs autoload the module */ MODULE_ALIAS("vxfs"); static int __init vxfs_init(void) { int rv; vxfs_inode_cachep = kmem_cache_create("vxfs_inode", sizeof(struct vxfs_inode_info), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); if (!vxfs_inode_cachep) return -ENOMEM; rv = register_filesystem(&vxfs_fs_type); if (rv < 0) kmem_cache_destroy(vxfs_inode_cachep); return rv; } static void __exit vxfs_cleanup(void) { unregister_filesystem(&vxfs_fs_type); /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(vxfs_inode_cachep); } module_init(vxfs_init); module_exit(vxfs_cleanup);
11 9 12 11 12 8 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 /* * common LSM auditing functions * * Based on code written for SELinux by : * Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * Author : Etienne Basset, <etienne.basset@ensta.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2, * as published by the Free Software Foundation. */ #include <linux/types.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/gfp.h> #include <linux/fs.h> #include <linux/init.h> #include <net/sock.h> #include <linux/un.h> #include <net/af_unix.h> #include <linux/audit.h> #include <linux/ipv6.h> #include <linux/ip.h> #include <net/ip.h> #include <net/ipv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <linux/lsm_audit.h> /** * ipv4_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv4_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int ret = 0; struct iphdr *ih; ih = ip_hdr(skb); if (ih == NULL) return -EINVAL; ad->u.net->v4info.saddr = ih->saddr; ad->u.net->v4info.daddr = ih->daddr; if (proto) *proto = ih->protocol; /* non initial fragment */ if (ntohs(ih->frag_off) & IP_OFFSET) return 0; switch (ih->protocol) { case IPPROTO_TCP: { struct tcphdr *th = tcp_hdr(skb); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr *uh = udp_hdr(skb); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr *dh = dccp_hdr(skb); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr *sh = sctp_hdr(skb); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #if IS_ENABLED(CONFIG_IPV6) /** * ipv6_skb_to_auditdata : fill auditdata from skb * @skb : the skb * @ad : the audit data to fill * @proto : the layer 4 protocol * * return 0 on success */ int ipv6_skb_to_auditdata(struct sk_buff *skb, struct common_audit_data *ad, u8 *proto) { int offset, ret = 0; struct ipv6hdr *ip6; u8 nexthdr; __be16 frag_off; ip6 = ipv6_hdr(skb); if (ip6 == NULL) return -EINVAL; ad->u.net->v6info.saddr = ip6->saddr; ad->u.net->v6info.daddr = ip6->daddr; ret = 0; /* IPv6 can have several extension header before the Transport header * skip them */ offset = skb_network_offset(skb); offset += sizeof(*ip6); nexthdr = ip6->nexthdr; offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off); if (offset < 0) return 0; if (proto) *proto = nexthdr; switch (nexthdr) { case IPPROTO_TCP: { struct tcphdr _tcph, *th; th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); if (th == NULL) break; ad->u.net->sport = th->source; ad->u.net->dport = th->dest; break; } case IPPROTO_UDP: { struct udphdr _udph, *uh; uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); if (uh == NULL) break; ad->u.net->sport = uh->source; ad->u.net->dport = uh->dest; break; } case IPPROTO_DCCP: { struct dccp_hdr _dccph, *dh; dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); if (dh == NULL) break; ad->u.net->sport = dh->dccph_sport; ad->u.net->dport = dh->dccph_dport; break; } case IPPROTO_SCTP: { struct sctphdr _sctph, *sh; sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); if (sh == NULL) break; ad->u.net->sport = sh->source; ad->u.net->dport = sh->dest; break; } default: ret = -EINVAL; } return ret; } #endif static inline void print_ipv6_addr(struct audit_buffer *ab, struct in6_addr *addr, __be16 port, char *name1, char *name2) { if (!ipv6_addr_any(addr)) audit_log_format(ab, " %s=%pI6c", name1, addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr, __be16 port, char *name1, char *name2) { if (addr) audit_log_format(ab, " %s=%pI4", name1, &addr); if (port) audit_log_format(ab, " %s=%d", name2, ntohs(port)); } /** * dump_common_audit_data - helper to dump common audit data * @a : common audit data * */ static void dump_common_audit_data(struct audit_buffer *ab, struct common_audit_data *a) { char comm[sizeof(current->comm)]; /* * To keep stack sizes in check force programers to notice if they * start making this union too large! See struct lsm_network_audit * as an example of how to deal with large data. */ BUILD_BUG_ON(sizeof(a->u) > sizeof(void *)*2); audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current)); audit_log_untrustedstring(ab, memcpy(comm, current->comm, sizeof(comm))); switch (a->type) { case LSM_AUDIT_DATA_NONE: return; case LSM_AUDIT_DATA_IPC: audit_log_format(ab, " key=%d ", a->u.ipc_id); break; case LSM_AUDIT_DATA_CAP: audit_log_format(ab, " capability=%d ", a->u.cap); break; case LSM_AUDIT_DATA_PATH: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.path); inode = d_backing_inode(a->u.path.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_FILE: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.file->f_path); inode = file_inode(a->u.file); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_IOCTL_OP: { struct inode *inode; audit_log_d_path(ab, " path=", &a->u.op->path); inode = a->u.op->path.dentry->d_inode; if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); break; } case LSM_AUDIT_DATA_DENTRY: { struct inode *inode; audit_log_format(ab, " name="); spin_lock(&a->u.dentry->d_lock); audit_log_untrustedstring(ab, a->u.dentry->d_name.name); spin_unlock(&a->u.dentry->d_lock); inode = d_backing_inode(a->u.dentry); if (inode) { audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } break; } case LSM_AUDIT_DATA_INODE: { struct dentry *dentry; struct inode *inode; inode = a->u.inode; dentry = d_find_alias(inode); if (dentry) { audit_log_format(ab, " name="); spin_lock(&dentry->d_lock); audit_log_untrustedstring(ab, dentry->d_name.name); spin_unlock(&dentry->d_lock); dput(dentry); } audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); break; } case LSM_AUDIT_DATA_TASK: { struct task_struct *tsk = a->u.tsk; if (tsk) { pid_t pid = task_tgid_nr(tsk); if (pid) { char comm[sizeof(tsk->comm)]; audit_log_format(ab, " opid=%d ocomm=", pid); audit_log_untrustedstring(ab, memcpy(comm, tsk->comm, sizeof(comm))); } } break; } case LSM_AUDIT_DATA_NET: if (a->u.net->sk) { struct sock *sk = a->u.net->sk; struct unix_sock *u; struct unix_address *addr; int len = 0; char *p = NULL; switch (sk->sk_family) { case AF_INET: { struct inet_sock *inet = inet_sk(sk); print_ipv4_addr(ab, inet->inet_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv4_addr(ab, inet->inet_daddr, inet->inet_dport, "faddr", "fport"); break; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct inet_sock *inet = inet_sk(sk); print_ipv6_addr(ab, &sk->sk_v6_rcv_saddr, inet->inet_sport, "laddr", "lport"); print_ipv6_addr(ab, &sk->sk_v6_daddr, inet->inet_dport, "faddr", "fport"); break; } #endif case AF_UNIX: u = unix_sk(sk); addr = smp_load_acquire(&u->addr); if (!addr) break; if (u->path.dentry) { audit_log_d_path(ab, " path=", &u->path); break; } len = addr->len-sizeof(short); p = &addr->name->sun_path[0]; audit_log_format(ab, " path="); if (*p) audit_log_untrustedstring(ab, p); else audit_log_n_hex(ab, p, len); break; } } switch (a->u.net->family) { case AF_INET: print_ipv4_addr(ab, a->u.net->v4info.saddr, a->u.net->sport, "saddr", "src"); print_ipv4_addr(ab, a->u.net->v4info.daddr, a->u.net->dport, "daddr", "dest"); break; case AF_INET6: print_ipv6_addr(ab, &a->u.net->v6info.saddr, a->u.net->sport, "saddr", "src"); print_ipv6_addr(ab, &a->u.net->v6info.daddr, a->u.net->dport, "daddr", "dest"); break; } if (a->u.net->netif > 0) { struct net_device *dev; /* NOTE: we always use init's namespace */ dev = dev_get_by_index(&init_net, a->u.net->netif); if (dev) { audit_log_format(ab, " netif=%s", dev->name); dev_put(dev); } } break; #ifdef CONFIG_KEYS case LSM_AUDIT_DATA_KEY: audit_log_format(ab, " key_serial=%u", a->u.key_struct.key); if (a->u.key_struct.key_desc) { audit_log_format(ab, " key_desc="); audit_log_untrustedstring(ab, a->u.key_struct.key_desc); } break; #endif case LSM_AUDIT_DATA_KMOD: audit_log_format(ab, " kmod="); audit_log_untrustedstring(ab, a->u.kmod_name); break; case LSM_AUDIT_DATA_IBPKEY: { struct in6_addr sbn_pfx; memset(&sbn_pfx.s6_addr, 0, sizeof(sbn_pfx.s6_addr)); memcpy(&sbn_pfx.s6_addr, &a->u.ibpkey->subnet_prefix, sizeof(a->u.ibpkey->subnet_prefix)); audit_log_format(ab, " pkey=0x%x subnet_prefix=%pI6c", a->u.ibpkey->pkey, &sbn_pfx); break; } case LSM_AUDIT_DATA_IBENDPORT: audit_log_format(ab, " device=%s port_num=%u", a->u.ibendport->dev_name, a->u.ibendport->port); break; } /* switch (a->type) */ } /** * common_lsm_audit - generic LSM auditing function * @a: auxiliary audit data * @pre_audit: lsm-specific pre-audit callback * @post_audit: lsm-specific post-audit callback * * setup the audit buffer for common security information * uses callback to print LSM specific information */ void common_lsm_audit(struct common_audit_data *a, void (*pre_audit)(struct audit_buffer *, void *), void (*post_audit)(struct audit_buffer *, void *)) { struct audit_buffer *ab; if (a == NULL) return; /* we use GFP_ATOMIC so we won't sleep */ ab = audit_log_start(current->audit_context, GFP_ATOMIC | __GFP_NOWARN, AUDIT_AVC); if (ab == NULL) return; if (pre_audit) pre_audit(ab, a); dump_common_audit_data(ab, a); if (post_audit) post_audit(ab, a); audit_log_end(ab); }
68 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <asm/div64.h> #include <linux/reciprocal_div.h> #include <linux/export.h> /* * For a description of the algorithm please have a look at * include/linux/reciprocal_div.h */ struct reciprocal_value reciprocal_value(u32 d) { struct reciprocal_value R; u64 m; int l; l = fls(d - 1); m = ((1ULL << 32) * ((1ULL << l) - d)); do_div(m, d); ++m; R.m = (u32)m; R.sh1 = min(l, 1); R.sh2 = max(l - 1, 0); return R; } EXPORT_SYMBOL(reciprocal_value);
9 7 9 25 25 25 7 7 28 28 28 27 1 25 25 25 25 24 23 4 4 4 3 2 2 2 4 2 1 1 2 4 1 4 9 9 1 1 9 18 4 3 3 18 4 4 4 4 4 4 16 16 25 24 24 24 24 24 25 4 4 1 1 1 1 1 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 /* * File: socket.c * * Phonet sockets * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * version 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA */ #include <linux/gfp.h> #include <linux/kernel.h> #include <linux/net.h> #include <linux/poll.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/phonet.h> #include <linux/export.h> #include <net/phonet/phonet.h> #include <net/phonet/pep.h> #include <net/phonet/pn_dev.h> static int pn_socket_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { sock->sk = NULL; sk->sk_prot->close(sk, 0); } return 0; } #define PN_HASHSIZE 16 #define PN_HASHMASK (PN_HASHSIZE-1) static struct { struct hlist_head hlist[PN_HASHSIZE]; struct mutex lock; } pnsocks; void __init pn_sock_init(void) { unsigned int i; for (i = 0; i < PN_HASHSIZE; i++) INIT_HLIST_HEAD(pnsocks.hlist + i); mutex_init(&pnsocks.lock); } static struct hlist_head *pn_hash_list(u16 obj) { return pnsocks.hlist + (obj & PN_HASHMASK); } /* * Find address based on socket address, match only certain fields. * Also grab sock if it was found. Remember to sock_put it later. */ struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *spn) { struct sock *sknode; struct sock *rval = NULL; u16 obj = pn_sockaddr_get_object(spn); u8 res = spn->spn_resource; struct hlist_head *hlist = pn_hash_list(obj); rcu_read_lock(); sk_for_each_rcu(sknode, hlist) { struct pn_sock *pn = pn_sk(sknode); BUG_ON(!pn->sobject); /* unbound socket */ if (!net_eq(sock_net(sknode), net)) continue; if (pn_port(obj)) { /* Look up socket by port */ if (pn_port(pn->sobject) != pn_port(obj)) continue; } else { /* If port is zero, look up by resource */ if (pn->resource != res) continue; } if (pn_addr(pn->sobject) && pn_addr(pn->sobject) != pn_addr(obj)) continue; rval = sknode; sock_hold(sknode); break; } rcu_read_unlock(); return rval; } /* Deliver a broadcast packet (only in bottom-half) */ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb) { struct hlist_head *hlist = pnsocks.hlist; unsigned int h; rcu_read_lock(); for (h = 0; h < PN_HASHSIZE; h++) { struct sock *sknode; sk_for_each(sknode, hlist) { struct sk_buff *clone; if (!net_eq(sock_net(sknode), net)) continue; if (!sock_flag(sknode, SOCK_BROADCAST)) continue; clone = skb_clone(skb, GFP_ATOMIC); if (clone) { sock_hold(sknode); sk_receive_skb(sknode, clone, 0); } } hlist++; } rcu_read_unlock(); } int pn_sock_hash(struct sock *sk) { struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject); mutex_lock(&pnsocks.lock); sk_add_node_rcu(sk, hlist); mutex_unlock(&pnsocks.lock); return 0; } EXPORT_SYMBOL(pn_sock_hash); void pn_sock_unhash(struct sock *sk) { mutex_lock(&pnsocks.lock); sk_del_node_init_rcu(sk); mutex_unlock(&pnsocks.lock); pn_sock_unbind_all_res(sk); synchronize_rcu(); } EXPORT_SYMBOL(pn_sock_unhash); static DEFINE_MUTEX(port_mutex); static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); struct sockaddr_pn *spn = (struct sockaddr_pn *)addr; int err; u16 handle; u8 saddr; if (sk->sk_prot->bind) return sk->sk_prot->bind(sk, addr, len); if (len < sizeof(struct sockaddr_pn)) return -EINVAL; if (spn->spn_family != AF_PHONET) return -EAFNOSUPPORT; handle = pn_sockaddr_get_object((struct sockaddr_pn *)addr); saddr = pn_addr(handle); if (saddr && phonet_address_lookup(sock_net(sk), saddr)) return -EADDRNOTAVAIL; lock_sock(sk); if (sk->sk_state != TCP_CLOSE || pn_port(pn->sobject)) { err = -EINVAL; /* attempt to rebind */ goto out; } WARN_ON(sk_hashed(sk)); mutex_lock(&port_mutex); err = sk->sk_prot->get_port(sk, pn_port(handle)); if (err) goto out_port; /* get_port() sets the port, bind() sets the address if applicable */ pn->sobject = pn_object(saddr, pn_port(pn->sobject)); pn->resource = spn->spn_resource; /* Enable RX on the socket */ err = sk->sk_prot->hash(sk); out_port: mutex_unlock(&port_mutex); out: release_sock(sk); return err; } static int pn_socket_autobind(struct socket *sock) { struct sockaddr_pn sa; int err; memset(&sa, 0, sizeof(sa)); sa.spn_family = AF_PHONET; err = pn_socket_bind(sock, (struct sockaddr *)&sa, sizeof(struct sockaddr_pn)); if (err != -EINVAL) return err; BUG_ON(!pn_port(pn_sk(sock->sk)->sobject)); return 0; /* socket was already bound */ } static int pn_socket_connect(struct socket *sock, struct sockaddr *addr, int len, int flags) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); struct sockaddr_pn *spn = (struct sockaddr_pn *)addr; struct task_struct *tsk = current; long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); int err; if (pn_socket_autobind(sock)) return -ENOBUFS; if (len < sizeof(struct sockaddr_pn)) return -EINVAL; if (spn->spn_family != AF_PHONET) return -EAFNOSUPPORT; lock_sock(sk); switch (sock->state) { case SS_UNCONNECTED: if (sk->sk_state != TCP_CLOSE) { err = -EISCONN; goto out; } break; case SS_CONNECTING: err = -EALREADY; goto out; default: err = -EISCONN; goto out; } pn->dobject = pn_sockaddr_get_object(spn); pn->resource = pn_sockaddr_get_resource(spn); sock->state = SS_CONNECTING; err = sk->sk_prot->connect(sk, addr, len); if (err) { sock->state = SS_UNCONNECTED; pn->dobject = 0; goto out; } while (sk->sk_state == TCP_SYN_SENT) { DEFINE_WAIT(wait); if (!timeo) { err = -EINPROGRESS; goto out; } if (signal_pending(tsk)) { err = sock_intr_errno(timeo); goto out; } prepare_to_wait_exclusive(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); timeo = schedule_timeout(timeo); lock_sock(sk); finish_wait(sk_sleep(sk), &wait); } if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) err = 0; else if (sk->sk_state == TCP_CLOSE_WAIT) err = -ECONNRESET; else err = -ECONNREFUSED; sock->state = err ? SS_UNCONNECTED : SS_CONNECTED; out: release_sock(sk); return err; } static int pn_socket_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; struct sock *newsk; int err; if (unlikely(sk->sk_state != TCP_LISTEN)) return -EINVAL; newsk = sk->sk_prot->accept(sk, flags, &err, kern); if (!newsk) return err; lock_sock(newsk); sock_graft(newsk, newsock); newsock->state = SS_CONNECTED; release_sock(newsk); return 0; } static int pn_socket_getname(struct socket *sock, struct sockaddr *addr, int *sockaddr_len, int peer) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); memset(addr, 0, sizeof(struct sockaddr_pn)); addr->sa_family = AF_PHONET; if (!peer) /* Race with bind() here is userland's problem. */ pn_sockaddr_set_object((struct sockaddr_pn *)addr, pn->sobject); *sockaddr_len = sizeof(struct sockaddr_pn); return 0; } static unsigned int pn_socket_poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; struct pep_sock *pn = pep_sk(sk); unsigned int mask = 0; poll_wait(file, sk_sleep(sk), wait); if (sk->sk_state == TCP_CLOSE) return POLLERR; if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; if (!skb_queue_empty_lockless(&pn->ctrlreq_queue)) mask |= POLLPRI; if (!mask && sk->sk_state == TCP_CLOSE_WAIT) return POLLHUP; if (sk->sk_state == TCP_ESTABLISHED && refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf && atomic_read(&pn->tx_credits)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; return mask; } static int pn_socket_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); if (cmd == SIOCPNGETOBJECT) { struct net_device *dev; u16 handle; u8 saddr; if (get_user(handle, (__u16 __user *)arg)) return -EFAULT; lock_sock(sk); if (sk->sk_bound_dev_if) dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if); else dev = phonet_device_get(sock_net(sk)); if (dev && (dev->flags & IFF_UP)) saddr = phonet_address_get(dev, pn_addr(handle)); else saddr = PN_NO_ADDR; release_sock(sk); if (dev) dev_put(dev); if (saddr == PN_NO_ADDR) return -EHOSTUNREACH; handle = pn_object(saddr, pn_port(pn->sobject)); return put_user(handle, (__u16 __user *)arg); } return sk->sk_prot->ioctl(sk, cmd, arg); } static int pn_socket_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = 0; if (pn_socket_autobind(sock)) return -ENOBUFS; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { err = -EINVAL; goto out; } if (sk->sk_state != TCP_LISTEN) { sk->sk_state = TCP_LISTEN; sk->sk_ack_backlog = 0; } sk->sk_max_ack_backlog = backlog; out: release_sock(sk); return err; } static int pn_socket_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { struct sock *sk = sock->sk; if (pn_socket_autobind(sock)) return -EAGAIN; return sk->sk_prot->sendmsg(sk, m, total_len); } const struct proto_ops phonet_dgram_ops = { .family = AF_PHONET, .owner = THIS_MODULE, .release = pn_socket_release, .bind = pn_socket_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = pn_socket_getname, .poll = datagram_poll, .ioctl = pn_socket_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, #ifdef CONFIG_COMPAT .compat_setsockopt = sock_no_setsockopt, .compat_getsockopt = sock_no_getsockopt, #endif .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; const struct proto_ops phonet_stream_ops = { .family = AF_PHONET, .owner = THIS_MODULE, .release = pn_socket_release, .bind = pn_socket_bind, .connect = pn_socket_connect, .socketpair = sock_no_socketpair, .accept = pn_socket_accept, .getname = pn_socket_getname, .poll = pn_socket_poll, .ioctl = pn_socket_ioctl, .listen = pn_socket_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif .sendmsg = pn_socket_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; EXPORT_SYMBOL(phonet_stream_ops); /* allocate port for a socket */ int pn_sock_get_port(struct sock *sk, unsigned short sport) { static int port_cur; struct net *net = sock_net(sk); struct pn_sock *pn = pn_sk(sk); struct sockaddr_pn try_sa; struct sock *tmpsk; memset(&try_sa, 0, sizeof(struct sockaddr_pn)); try_sa.spn_family = AF_PHONET; WARN_ON(!mutex_is_locked(&port_mutex)); if (!sport) { /* search free port */ int port, pmin, pmax; phonet_get_local_port_range(&pmin, &pmax); for (port = pmin; port <= pmax; port++) { port_cur++; if (port_cur < pmin || port_cur > pmax) port_cur = pmin; pn_sockaddr_set_port(&try_sa, port_cur); tmpsk = pn_find_sock_by_sa(net, &try_sa); if (tmpsk == NULL) { sport = port_cur; goto found; } else sock_put(tmpsk); } } else { /* try to find specific port */ pn_sockaddr_set_port(&try_sa, sport); tmpsk = pn_find_sock_by_sa(net, &try_sa); if (tmpsk == NULL) /* No sock there! We can use that port... */ goto found; else sock_put(tmpsk); } /* the port must be in use already */ return -EADDRINUSE; found: pn->sobject = pn_object(pn_addr(pn->sobject), sport); return 0; } EXPORT_SYMBOL(pn_sock_get_port); #ifdef CONFIG_PROC_FS static struct sock *pn_sock_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); struct hlist_head *hlist = pnsocks.hlist; struct sock *sknode; unsigned int h; for (h = 0; h < PN_HASHSIZE; h++) { sk_for_each_rcu(sknode, hlist) { if (!net_eq(net, sock_net(sknode))) continue; if (!pos) return sknode; pos--; } hlist++; } return NULL; } static struct sock *pn_sock_get_next(struct seq_file *seq, struct sock *sk) { struct net *net = seq_file_net(seq); do sk = sk_next(sk); while (sk && !net_eq(net, sock_net(sk))); return sk; } static void *pn_sock_seq_start(struct seq_file *seq, loff_t *pos) __acquires(rcu) { rcu_read_lock(); return *pos ? pn_sock_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *pn_sock_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock *sk; if (v == SEQ_START_TOKEN) sk = pn_sock_get_idx(seq, 0); else sk = pn_sock_get_next(seq, v); (*pos)++; return sk; } static void pn_sock_seq_stop(struct seq_file *seq, void *v) __releases(rcu) { rcu_read_unlock(); } static int pn_sock_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 127); if (v == SEQ_START_TOKEN) seq_puts(seq, "pt loc rem rs st tx_queue rx_queue " " uid inode ref pointer drops"); else { struct sock *sk = v; struct pn_sock *pn = pn_sk(sk); seq_printf(seq, "%2d %04X:%04X:%02X %02X %08X:%08X %5d %lu " "%d %pK %d", sk->sk_protocol, pn->sobject, pn->dobject, pn->resource, sk->sk_state, sk_wmem_alloc_get(sk), sk_rmem_alloc_get(sk), from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, atomic_read(&sk->sk_drops)); } seq_pad(seq, '\n'); return 0; } static const struct seq_operations pn_sock_seq_ops = { .start = pn_sock_seq_start, .next = pn_sock_seq_next, .stop = pn_sock_seq_stop, .show = pn_sock_seq_show, }; static int pn_sock_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &pn_sock_seq_ops, sizeof(struct seq_net_private)); } const struct file_operations pn_sock_seq_fops = { .owner = THIS_MODULE, .open = pn_sock_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, }; #endif static struct { struct sock *sk[256]; } pnres; /* * Find and hold socket based on resource. */ struct sock *pn_find_sock_by_res(struct net *net, u8 res) { struct sock *sk; if (!net_eq(net, &init_net)) return NULL; rcu_read_lock(); sk = rcu_dereference(pnres.sk[res]); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } static DEFINE_MUTEX(resource_mutex); int pn_sock_bind_res(struct sock *sk, u8 res) { int ret = -EADDRINUSE; if (!net_eq(sock_net(sk), &init_net)) return -ENOIOCTLCMD; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (pn_socket_autobind(sk->sk_socket)) return -EAGAIN; mutex_lock(&resource_mutex); if (pnres.sk[res] == NULL) { sock_hold(sk); rcu_assign_pointer(pnres.sk[res], sk); ret = 0; } mutex_unlock(&resource_mutex); return ret; } int pn_sock_unbind_res(struct sock *sk, u8 res) { int ret = -ENOENT; if (!capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&resource_mutex); if (pnres.sk[res] == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); ret = 0; } mutex_unlock(&resource_mutex); if (ret == 0) { synchronize_rcu(); sock_put(sk); } return ret; } void pn_sock_unbind_all_res(struct sock *sk) { unsigned int res, match = 0; mutex_lock(&resource_mutex); for (res = 0; res < 256; res++) { if (pnres.sk[res] == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); match++; } } mutex_unlock(&resource_mutex); while (match > 0) { __sock_put(sk); match--; } /* Caller is responsible for RCU sync before final sock_put() */ } #ifdef CONFIG_PROC_FS static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); unsigned int i; if (!net_eq(net, &init_net)) return NULL; for (i = 0; i < 256; i++) { if (pnres.sk[i] == NULL) continue; if (!pos) return pnres.sk + i; pos--; } return NULL; } static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk) { struct net *net = seq_file_net(seq); unsigned int i; BUG_ON(!net_eq(net, &init_net)); for (i = (sk - pnres.sk) + 1; i < 256; i++) if (pnres.sk[i]) return pnres.sk + i; return NULL; } static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos) __acquires(resource_mutex) { mutex_lock(&resource_mutex); return *pos ? pn_res_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct sock **sk; if (v == SEQ_START_TOKEN) sk = pn_res_get_idx(seq, 0); else sk = pn_res_get_next(seq, v); (*pos)++; return sk; } static void pn_res_seq_stop(struct seq_file *seq, void *v) __releases(resource_mutex) { mutex_unlock(&resource_mutex); } static int pn_res_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 63); if (v == SEQ_START_TOKEN) seq_puts(seq, "rs uid inode"); else { struct sock **psk = v; struct sock *sk = *psk; seq_printf(seq, "%02X %5u %lu", (int) (psk - pnres.sk), from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)), sock_i_ino(sk)); } seq_pad(seq, '\n'); return 0; } static const struct seq_operations pn_res_seq_ops = { .start = pn_res_seq_start, .next = pn_res_seq_next, .stop = pn_res_seq_stop, .show = pn_res_seq_show, }; static int pn_res_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &pn_res_seq_ops, sizeof(struct seq_net_private)); } const struct file_operations pn_res_seq_fops = { .owner = THIS_MODULE, .open = pn_res_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, }; #endif
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 /* Copyright (c) 2017 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ #include <linux/slab.h> #include <linux/bpf.h> #include "map_in_map.h" struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; u32 inner_map_meta_size; struct fd f; f = fdget(inner_map_ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; /* prog_array->owner_prog_type and owner_jited * is a runtime binding. Doing static check alone * in the verifier is not enough. */ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { fdput(f); return ERR_PTR(-ENOTSUPP); } /* Does not support >1 level map-in-map */ if (inner_map->inner_map_meta) { fdput(f); return ERR_PTR(-EINVAL); } inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ if (inner_map->ops == &array_map_ops) inner_map_meta_size = sizeof(struct bpf_array); inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { fdput(f); return ERR_PTR(-ENOMEM); } inner_map_meta->map_type = inner_map->map_type; inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; inner_map_meta->max_entries = inner_map->max_entries; /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { inner_map_meta->unpriv_array = inner_map->unpriv_array; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } fdput(f); return inner_map_meta; } void bpf_map_meta_free(struct bpf_map *map_meta) { kfree(map_meta); } bool bpf_map_meta_equal(const struct bpf_map *meta0, const struct bpf_map *meta1) { /* No need to compare ops because it is covered by map_type */ return meta0->map_type == meta1->map_type && meta0->key_size == meta1->key_size && meta0->value_size == meta1->value_size && meta0->map_flags == meta1->map_flags && meta0->max_entries == meta1->max_entries; } void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, int ufd) { struct bpf_map *inner_map; struct fd f; f = fdget(ufd); inner_map = __bpf_map_get(f); if (IS_ERR(inner_map)) return inner_map; if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) inner_map = bpf_map_inc(inner_map, false); else inner_map = ERR_PTR(-EINVAL); fdput(f); return inner_map; } void bpf_map_fd_put_ptr(void *ptr) { /* ptr->ops->map_free() has to go through one * rcu grace period by itself. */ bpf_map_put(ptr); } u32 bpf_map_fd_sys_lookup_elem(void *ptr) { return ((struct bpf_map *)ptr)->id; }
3 3 3 3 3 3 2 2 1 1 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 /* * ebt_ip6 * * Authors: * Manohar Castelino <manohar.r.castelino@intel.com> * Kuo-Lang Tseng <kuo-lang.tseng@intel.com> * Jan Engelhardt <jengelh@medozas.de> * * Summary: * This is just a modification of the IPv4 code written by * Bart De Schuymer <bdschuym@pandora.be> * with the changes required to support IPv6 * * Jan, 2008 */ #include <linux/ipv6.h> #include <net/ipv6.h> #include <linux/in.h> #include <linux/module.h> #include <net/dsfield.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> #include <linux/netfilter_bridge/ebt_ip6.h> union pkthdr { struct { __be16 src; __be16 dst; } tcpudphdr; struct { u8 type; u8 code; } icmphdr; }; static bool ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_ip6_info *info = par->matchinfo; const struct ipv6hdr *ih6; struct ipv6hdr _ip6h; const union pkthdr *pptr; union pkthdr _pkthdr; ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h); if (ih6 == NULL) return false; if ((info->bitmask & EBT_IP6_TCLASS) && NF_INVF(info, EBT_IP6_TCLASS, info->tclass != ipv6_get_dsfield(ih6))) return false; if (((info->bitmask & EBT_IP6_SOURCE) && NF_INVF(info, EBT_IP6_SOURCE, ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk, &info->saddr))) || ((info->bitmask & EBT_IP6_DEST) && NF_INVF(info, EBT_IP6_DEST, ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk, &info->daddr)))) return false; if (info->bitmask & EBT_IP6_PROTO) { uint8_t nexthdr = ih6->nexthdr; __be16 frag_off; int offset_ph; offset_ph = ipv6_skip_exthdr(skb, sizeof(_ip6h), &nexthdr, &frag_off); if (offset_ph == -1) return false; if (NF_INVF(info, EBT_IP6_PROTO, info->protocol != nexthdr)) return false; if (!(info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT | EBT_IP6_ICMP6))) return true; /* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */ pptr = skb_header_pointer(skb, offset_ph, sizeof(_pkthdr), &_pkthdr); if (pptr == NULL) return false; if (info->bitmask & EBT_IP6_DPORT) { u16 dst = ntohs(pptr->tcpudphdr.dst); if (NF_INVF(info, EBT_IP6_DPORT, dst < info->dport[0] || dst > info->dport[1])) return false; } if (info->bitmask & EBT_IP6_SPORT) { u16 src = ntohs(pptr->tcpudphdr.src); if (NF_INVF(info, EBT_IP6_SPORT, src < info->sport[0] || src > info->sport[1])) return false; } if ((info->bitmask & EBT_IP6_ICMP6) && NF_INVF(info, EBT_IP6_ICMP6, pptr->icmphdr.type < info->icmpv6_type[0] || pptr->icmphdr.type > info->icmpv6_type[1] || pptr->icmphdr.code < info->icmpv6_code[0] || pptr->icmphdr.code > info->icmpv6_code[1])) return false; } return true; } static int ebt_ip6_mt_check(const struct xt_mtchk_param *par) { const struct ebt_entry *e = par->entryinfo; struct ebt_ip6_info *info = par->matchinfo; if (e->ethproto != htons(ETH_P_IPV6) || e->invflags & EBT_IPROTO) return -EINVAL; if (info->bitmask & ~EBT_IP6_MASK || info->invflags & ~EBT_IP6_MASK) return -EINVAL; if (info->bitmask & (EBT_IP6_DPORT | EBT_IP6_SPORT)) { if (info->invflags & EBT_IP6_PROTO) return -EINVAL; if (info->protocol != IPPROTO_TCP && info->protocol != IPPROTO_UDP && info->protocol != IPPROTO_UDPLITE && info->protocol != IPPROTO_SCTP && info->protocol != IPPROTO_DCCP) return -EINVAL; } if (info->bitmask & EBT_IP6_DPORT && info->dport[0] > info->dport[1]) return -EINVAL; if (info->bitmask & EBT_IP6_SPORT && info->sport[0] > info->sport[1]) return -EINVAL; if (info->bitmask & EBT_IP6_ICMP6) { if ((info->invflags & EBT_IP6_PROTO) || info->protocol != IPPROTO_ICMPV6) return -EINVAL; if (info->icmpv6_type[0] > info->icmpv6_type[1] || info->icmpv6_code[0] > info->icmpv6_code[1]) return -EINVAL; } return 0; } static struct xt_match ebt_ip6_mt_reg __read_mostly = { .name = "ip6", .revision = 0, .family = NFPROTO_BRIDGE, .match = ebt_ip6_mt, .checkentry = ebt_ip6_mt_check, .matchsize = sizeof(struct ebt_ip6_info), .me = THIS_MODULE, }; static int __init ebt_ip6_init(void) { return xt_register_match(&ebt_ip6_mt_reg); } static void __exit ebt_ip6_fini(void) { xt_unregister_match(&ebt_ip6_mt_reg); } module_init(ebt_ip6_init); module_exit(ebt_ip6_fini); MODULE_DESCRIPTION("Ebtables: IPv6 protocol packet match"); MODULE_AUTHOR("Kuo-Lang Tseng <kuo-lang.tseng@intel.com>"); MODULE_LICENSE("GPL");
1 1 1 1 1 374 374 374 306 368 368 374 374 79 2 79 15 133 26 76 76 305 344 344 337 92 168 276 275 276 270 27 5 2 19 130 402 400 393 28 402 268 268 268 187 268 187 267 187 120 120 120 110 12 12 11 2 10 11 268 108 268 268 183 15 15 15 9 392 510 507 51 495 292 293 281 578 545 77 20 20 68 68 68 68 77 1 617 459 561 1 2 150 7 6 5 2 112 137 125 175 175 176 67 116 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * Initialization/cleanup for SCTP protocol support. * * This SCTP implementation is free software; * you can redistribute it and/or modify it under the terms of * the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This SCTP implementation is distributed in the hope that it * will be useful, but WITHOUT ANY WARRANTY; without even the implied * ************************ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GNU CC; see the file COPYING. If not, see * <http://www.gnu.org/licenses/>. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/seq_file.h> #include <linux/bootmem.h> #include <linux/highmem.h> #include <linux/swap.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/route.h> #include <net/sctp/sctp.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024) /* Global data structures. */ struct sctp_globals sctp_globals __read_mostly; struct idr sctp_assocs_id; DEFINE_SPINLOCK(sctp_assocs_id_lock); static struct sctp_pf *sctp_pf_inet6_specific; static struct sctp_pf *sctp_pf_inet_specific; static struct sctp_af *sctp_af_v4_specific; static struct sctp_af *sctp_af_v6_specific; struct kmem_cache *sctp_chunk_cachep __read_mostly; struct kmem_cache *sctp_bucket_cachep __read_mostly; long sysctl_sctp_mem[3]; int sysctl_sctp_rmem[3]; int sysctl_sctp_wmem[3]; /* Set up the proc fs entry for the SCTP protocol. */ static int __net_init sctp_proc_init(struct net *net) { #ifdef CONFIG_PROC_FS net->sctp.proc_net_sctp = proc_net_mkdir(net, "sctp", net->proc_net); if (!net->sctp.proc_net_sctp) goto out_proc_net_sctp; if (sctp_snmp_proc_init(net)) goto out_snmp_proc_init; if (sctp_eps_proc_init(net)) goto out_eps_proc_init; if (sctp_assocs_proc_init(net)) goto out_assocs_proc_init; if (sctp_remaddr_proc_init(net)) goto out_remaddr_proc_init; return 0; out_remaddr_proc_init: sctp_assocs_proc_exit(net); out_assocs_proc_init: sctp_eps_proc_exit(net); out_eps_proc_init: sctp_snmp_proc_exit(net); out_snmp_proc_init: remove_proc_entry("sctp", net->proc_net); net->sctp.proc_net_sctp = NULL; out_proc_net_sctp: return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } /* Clean up the proc fs entry for the SCTP protocol. * Note: Do not make this __exit as it is used in the init error * path. */ static void sctp_proc_exit(struct net *net) { #ifdef CONFIG_PROC_FS sctp_snmp_proc_exit(net); sctp_eps_proc_exit(net); sctp_assocs_proc_exit(net); sctp_remaddr_proc_exit(net); remove_proc_entry("sctp", net->proc_net); net->sctp.proc_net_sctp = NULL; #endif } /* Private helper to extract ipv4 address and stash them in * the protocol structure. */ static void sctp_v4_copy_addrlist(struct list_head *addrlist, struct net_device *dev) { struct in_device *in_dev; struct in_ifaddr *ifa; struct sctp_sockaddr_entry *addr; rcu_read_lock(); if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { rcu_read_unlock(); return; } for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; addr->valid = 1; INIT_LIST_HEAD(&addr->list); list_add_tail(&addr->list, addrlist); } } rcu_read_unlock(); } /* Extract our IP addresses from the system and stash them in the * protocol structure. */ static void sctp_get_local_addr_list(struct net *net) { struct net_device *dev; struct list_head *pos; struct sctp_af *af; rcu_read_lock(); for_each_netdev_rcu(net, dev) { list_for_each(pos, &sctp_address_families) { af = list_entry(pos, struct sctp_af, list); af->copy_addrlist(&net->sctp.local_addr_list, dev); } } rcu_read_unlock(); } /* Free the existing local addresses. */ static void sctp_free_local_addr_list(struct net *net) { struct sctp_sockaddr_entry *addr; struct list_head *pos, *temp; list_for_each_safe(pos, temp, &net->sctp.local_addr_list) { addr = list_entry(pos, struct sctp_sockaddr_entry, list); list_del(pos); kfree(addr); } } /* Copy the local addresses which are valid for 'scope' into 'bp'. */ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp, enum sctp_scope scope, gfp_t gfp, int copy_flags) { struct sctp_sockaddr_entry *addr; union sctp_addr laddr; int error = 0; rcu_read_lock(); list_for_each_entry_rcu(addr, &net->sctp.local_addr_list, list) { if (!addr->valid) continue; if (!sctp_in_scope(net, &addr->a, scope)) continue; /* Now that the address is in scope, check to see if * the address type is really supported by the local * sock as well as the remote peer. */ if (addr->a.sa.sa_family == AF_INET && (!(copy_flags & SCTP_ADDR4_ALLOWED) || !(copy_flags & SCTP_ADDR4_PEERSUPP))) continue; if (addr->a.sa.sa_family == AF_INET6 && (!(copy_flags & SCTP_ADDR6_ALLOWED) || !(copy_flags & SCTP_ADDR6_PEERSUPP))) continue; laddr = addr->a; /* also works for setting ipv6 address port */ laddr.v4.sin_port = htons(bp->port); if (sctp_bind_addr_state(bp, &laddr) != -1) continue; error = sctp_add_bind_addr(bp, &addr->a, sizeof(addr->a), SCTP_ADDR_SRC, GFP_ATOMIC); if (error) break; } rcu_read_unlock(); return error; } /* Initialize a sctp_addr from in incoming skb. */ static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { /* Always called on head skb, so this is safe */ struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in *sa = &addr->v4; addr->v4.sin_family = AF_INET; if (is_saddr) { sa->sin_port = sh->source; sa->sin_addr.s_addr = ip_hdr(skb)->saddr; } else { sa->sin_port = sh->dest; sa->sin_addr.s_addr = ip_hdr(skb)->daddr; } memset(sa->sin_zero, 0, sizeof(sa->sin_zero)); } /* Initialize an sctp_addr from a socket. */ static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = 0; addr->v4.sin_addr.s_addr = inet_sk(sk)->inet_rcv_saddr; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ static void sctp_v4_to_sk_saddr(union sctp_addr *addr, struct sock *sk) { inet_sk(sk)->inet_rcv_saddr = addr->v4.sin_addr.s_addr; } /* Initialize sk->sk_daddr from sctp_addr. */ static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk) { inet_sk(sk)->inet_daddr = addr->v4.sin_addr.s_addr; } /* Initialize a sctp_addr from an address parameter. */ static bool sctp_v4_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { if (ntohs(param->v4.param_hdr.length) < sizeof(struct sctp_ipv4addr_param)) return false; addr->v4.sin_family = AF_INET; addr->v4.sin_port = port; addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return true; } /* Initialize an address parameter from a sctp_addr and return the length * of the address parameter. */ static int sctp_v4_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { int length = sizeof(struct sctp_ipv4addr_param); param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS; param->v4.param_hdr.length = htons(length); param->v4.addr.s_addr = addr->v4.sin_addr.s_addr; return length; } /* Initialize a sctp_addr from a dst_entry. */ static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct flowi4 *fl4, __be16 port) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_port = port; saddr->v4.sin_addr.s_addr = fl4->saddr; memset(saddr->v4.sin_zero, 0, sizeof(saddr->v4.sin_zero)); } /* Compare two addresses exactly. */ static int sctp_v4_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { if (addr1->sa.sa_family != addr2->sa.sa_family) return 0; if (addr1->v4.sin_port != addr2->v4.sin_port) return 0; if (addr1->v4.sin_addr.s_addr != addr2->v4.sin_addr.s_addr) return 0; return 1; } /* Initialize addr struct to INADDR_ANY. */ static void sctp_v4_inaddr_any(union sctp_addr *addr, __be16 port) { addr->v4.sin_family = AF_INET; addr->v4.sin_addr.s_addr = htonl(INADDR_ANY); addr->v4.sin_port = port; memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); } /* Is this a wildcard address? */ static int sctp_v4_is_any(const union sctp_addr *addr) { return htonl(INADDR_ANY) == addr->v4.sin_addr.s_addr; } /* This function checks if the address is a valid address to be used for * SCTP binding. * * Output: * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp, const struct sk_buff *skb) { /* IPv4 addresses not allowed */ if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; /* Is this a non-unicast address or a unusable SCTP address? */ if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) return 0; /* Is this a broadcast address? */ if (skb && skb_rtable(skb)->rt_flags & RTCF_BROADCAST) return 0; return 1; } /* Should this be available for binding? */ static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) { struct net *net = sock_net(&sp->inet.sk); int ret = inet_addr_type(net, addr->v4.sin_addr.s_addr); if (addr->v4.sin_addr.s_addr != htonl(INADDR_ANY) && ret != RTN_LOCAL && !sp->inet.freebind && !net->ipv4.sysctl_ip_nonlocal_bind) return 0; if (ipv6_only_sock(sctp_opt2sk(sp))) return 0; return 1; } /* Checking the loopback, private and other address scopes as defined in * RFC 1918. The IPv4 scoping is based on the draft for SCTP IPv4 * scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>. * * Level 0 - unusable SCTP addresses * Level 1 - loopback address * Level 2 - link-local addresses * Level 3 - private addresses. * Level 4 - global addresses * For INIT and INIT-ACK address list, let L be the level of * of requested destination address, sender and receiver * SHOULD include all of its addresses with level greater * than or equal to L. * * IPv4 scoping can be controlled through sysctl option * net.sctp.addr_scope_policy */ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr) { enum sctp_scope retval; /* Check for unusable SCTP addresses. */ if (IS_IPV4_UNUSABLE_ADDRESS(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_UNUSABLE; } else if (ipv4_is_loopback(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_LOOPBACK; } else if (ipv4_is_linklocal_169(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_LINK; } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) || ipv4_is_private_172(addr->v4.sin_addr.s_addr) || ipv4_is_private_192(addr->v4.sin_addr.s_addr) || ipv4_is_test_198(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_PRIVATE; } else { retval = SCTP_SCOPE_GLOBAL; } return retval; } /* Returns a valid dst cache entry for the given source and destination ip * addresses. If an association is passed, trys to get a dst entry with a * source address that matches an address in the bind address list. */ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct flowi *fl, struct sock *sk) { struct sctp_association *asoc = t->asoc; struct rtable *rt; struct flowi _fl; struct flowi4 *fl4 = &_fl.u.ip4; struct sctp_bind_addr *bp; struct sctp_sockaddr_entry *laddr; struct dst_entry *dst = NULL; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; memset(&_fl, 0x0, sizeof(_fl)); fl4->daddr = daddr->v4.sin_addr.s_addr; fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { fl4->flowi4_tos = RT_CONN_FLAGS(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); } if (saddr) { fl4->saddr = saddr->v4.sin_addr.s_addr; fl4->fl4_sport = saddr->v4.sin_port; } pr_debug("%s: dst:%pI4, src:%pI4 - ", __func__, &fl4->daddr, &fl4->saddr); rt = ip_route_output_key(sock_net(sk), fl4); if (!IS_ERR(rt)) { dst = &rt->dst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } /* If there is no association or if a source address is passed, no * more validation is required. */ if (!asoc || saddr) goto out; bp = &asoc->base.bind_addr; if (dst) { /* Walk through the bind address list and look for a bind * address that matches the source address of the returned dst. */ sctp_v4_dst_saddr(&dst_saddr, fl4, htons(bp->port)); rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid || (laddr->state == SCTP_ADDR_DEL) || (laddr->state != SCTP_ADDR_SRC && !asoc->src_out_of_asoc_ok)) continue; if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a)) goto out_unlock; } rcu_read_unlock(); /* None of the bound addresses match the source address of the * dst. So release it. */ dst_release(dst); dst = NULL; } /* Walk through the bind address list and try to get a dst that * matches a bind address as the source address. */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { struct net_device *odev; if (!laddr->valid) continue; if (laddr->state != SCTP_ADDR_SRC || AF_INET != laddr->a.sa.sa_family) continue; fl4->fl4_sport = laddr->a.v4.sin_port; flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, RT_CONN_FLAGS(asoc->base.sk), daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); rt = ip_route_output_key(sock_net(sk), fl4); if (IS_ERR(rt)) continue; /* Ensure the src address belongs to the output * interface. */ odev = __ip_dev_find(sock_net(sk), laddr->a.v4.sin_addr.s_addr, false); if (!odev || odev->ifindex != fl4->flowi4_oif) { if (!dst) { dst = &rt->dst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } else { dst_release(&rt->dst); } continue; } dst_release(dst); dst = &rt->dst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); break; } out_unlock: rcu_read_unlock(); out: if (dst) { pr_debug("rt_dst:%pI4, rt_src:%pI4\n", &fl->u.ip4.daddr, &fl->u.ip4.saddr); } else { t->dst = NULL; pr_debug("no route\n"); } } /* For v4, the source address is cached in the route entry(dst). So no need * to cache it separately and hence this is an empty routine. */ static void sctp_v4_get_saddr(struct sctp_sock *sk, struct sctp_transport *t, struct flowi *fl) { union sctp_addr *saddr = &t->saddr; struct rtable *rt = (struct rtable *)t->dst; if (rt) { saddr->v4.sin_family = AF_INET; saddr->v4.sin_addr.s_addr = fl->u.ip4.saddr; } } /* What interface did this skb arrive on? */ static int sctp_v4_skb_iif(const struct sk_buff *skb) { return inet_iif(skb); } /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v4_is_ce(const struct sk_buff *skb) { return INET_ECN_is_ce(ip_hdr(skb)->tos); } /* Create and initialize a new sk for the socket returned by accept(). */ static struct sock *sctp_v4_create_accept_sk(struct sock *sk, struct sctp_association *asoc, bool kern) { struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, sk->sk_prot, kern); struct inet_sock *newinet; if (!newsk) goto out; sock_init_data(NULL, newsk); sctp_copy_sock(newsk, sk, asoc); sock_reset_flag(newsk, SOCK_ZAPPED); newinet = inet_sk(newsk); newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; } out: return newsk; } static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { /* No address mapping for V4 sockets */ memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return sizeof(struct sockaddr_in); } /* Dump the v4 addr to the seq file. */ static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) { seq_printf(seq, "%pI4 ", &addr->v4.sin_addr); } static void sctp_v4_ecn_capable(struct sock *sk) { INET_ECN_xmit(sk); } static void sctp_addr_wq_timeout_handler(unsigned long arg) { struct net *net = (struct net *)arg; struct sctp_sockaddr_entry *addrw, *temp; struct sctp_sock *sp; spin_lock_bh(&net->sctp.addr_wq_lock); list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) { pr_debug("%s: the first ent in wq:%p is addr:%pISc for cmd:%d at " "entry:%p\n", __func__, &net->sctp.addr_waitq, &addrw->a.sa, addrw->state, addrw); #if IS_ENABLED(CONFIG_IPV6) /* Now we send an ASCONF for each association */ /* Note. we currently don't handle link local IPv6 addressees */ if (addrw->a.sa.sa_family == AF_INET6) { struct in6_addr *in6; if (ipv6_addr_type(&addrw->a.v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) goto free_next; in6 = (struct in6_addr *)&addrw->a.v6.sin6_addr; if (ipv6_chk_addr(net, in6, NULL, 0) == 0 && addrw->state == SCTP_ADDR_NEW) { unsigned long timeo_val; pr_debug("%s: this is on DAD, trying %d sec " "later\n", __func__, SCTP_ADDRESS_TICK_DELAY); timeo_val = jiffies; timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY); mod_timer(&net->sctp.addr_wq_timer, timeo_val); break; } } #endif list_for_each_entry(sp, &net->sctp.auto_asconf_splist, auto_asconf_list) { struct sock *sk; sk = sctp_opt2sk(sp); /* ignore bound-specific endpoints */ if (!sctp_is_ep_boundall(sk)) continue; bh_lock_sock(sk); if (sctp_asconf_mgmt(sp, addrw) < 0) pr_debug("%s: sctp_asconf_mgmt failed\n", __func__); bh_unlock_sock(sk); } #if IS_ENABLED(CONFIG_IPV6) free_next: #endif list_del(&addrw->list); kfree(addrw); } spin_unlock_bh(&net->sctp.addr_wq_lock); } static void sctp_free_addr_wq(struct net *net) { struct sctp_sockaddr_entry *addrw; struct sctp_sockaddr_entry *temp; spin_lock_bh(&net->sctp.addr_wq_lock); del_timer(&net->sctp.addr_wq_timer); list_for_each_entry_safe(addrw, temp, &net->sctp.addr_waitq, list) { list_del(&addrw->list); kfree(addrw); } spin_unlock_bh(&net->sctp.addr_wq_lock); } /* lookup the entry for the same address in the addr_waitq * sctp_addr_wq MUST be locked */ static struct sctp_sockaddr_entry *sctp_addr_wq_lookup(struct net *net, struct sctp_sockaddr_entry *addr) { struct sctp_sockaddr_entry *addrw; list_for_each_entry(addrw, &net->sctp.addr_waitq, list) { if (addrw->a.sa.sa_family != addr->a.sa.sa_family) continue; if (addrw->a.sa.sa_family == AF_INET) { if (addrw->a.v4.sin_addr.s_addr == addr->a.v4.sin_addr.s_addr) return addrw; } else if (addrw->a.sa.sa_family == AF_INET6) { if (ipv6_addr_equal(&addrw->a.v6.sin6_addr, &addr->a.v6.sin6_addr)) return addrw; } } return NULL; } void sctp_addr_wq_mgmt(struct net *net, struct sctp_sockaddr_entry *addr, int cmd) { struct sctp_sockaddr_entry *addrw; unsigned long timeo_val; /* first, we check if an opposite message already exist in the queue. * If we found such message, it is removed. * This operation is a bit stupid, but the DHCP client attaches the * new address after a couple of addition and deletion of that address */ spin_lock_bh(&net->sctp.addr_wq_lock); /* Offsets existing events in addr_wq */ addrw = sctp_addr_wq_lookup(net, addr); if (addrw) { if (addrw->state != cmd) { pr_debug("%s: offsets existing entry for %d, addr:%pISc " "in wq:%p\n", __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq); list_del(&addrw->list); kfree(addrw); } spin_unlock_bh(&net->sctp.addr_wq_lock); return; } /* OK, we have to add the new address to the wait queue */ addrw = kmemdup(addr, sizeof(struct sctp_sockaddr_entry), GFP_ATOMIC); if (addrw == NULL) { spin_unlock_bh(&net->sctp.addr_wq_lock); return; } addrw->state = cmd; list_add_tail(&addrw->list, &net->sctp.addr_waitq); pr_debug("%s: add new entry for cmd:%d, addr:%pISc in wq:%p\n", __func__, addrw->state, &addrw->a.sa, &net->sctp.addr_waitq); if (!timer_pending(&net->sctp.addr_wq_timer)) { timeo_val = jiffies; timeo_val += msecs_to_jiffies(SCTP_ADDRESS_TICK_DELAY); mod_timer(&net->sctp.addr_wq_timer, timeo_val); } spin_unlock_bh(&net->sctp.addr_wq_lock); } /* Event handler for inet address addition/deletion events. * The sctp_local_addr_list needs to be protocted by a spin lock since * multiple notifiers (say IPv4 and IPv6) may be running at the same * time and thus corrupt the list. * The reader side is protected with RCU. */ static int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, void *ptr) { struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sctp_sockaddr_entry *addr = NULL; struct sctp_sockaddr_entry *temp; struct net *net = dev_net(ifa->ifa_dev->dev); int found = 0; switch (ev) { case NETDEV_UP: addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v4.sin_family = AF_INET; addr->a.v4.sin_addr.s_addr = ifa->ifa_local; addr->valid = 1; spin_lock_bh(&net->sctp.local_addr_lock); list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW); spin_unlock_bh(&net->sctp.local_addr_lock); } break; case NETDEV_DOWN: spin_lock_bh(&net->sctp.local_addr_lock); list_for_each_entry_safe(addr, temp, &net->sctp.local_addr_list, list) { if (addr->a.sa.sa_family == AF_INET && addr->a.v4.sin_addr.s_addr == ifa->ifa_local) { sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); found = 1; addr->valid = 0; list_del_rcu(&addr->list); break; } } spin_unlock_bh(&net->sctp.local_addr_lock); if (found) kfree_rcu(addr, rcu); break; } return NOTIFY_DONE; } /* * Initialize the control inode/socket with a control endpoint data * structure. This endpoint is reserved exclusively for the OOTB processing. */ static int sctp_ctl_sock_init(struct net *net) { int err; sa_family_t family = PF_INET; if (sctp_get_pf_specific(PF_INET6)) family = PF_INET6; err = inet_ctl_sock_create(&net->sctp.ctl_sock, family, SOCK_SEQPACKET, IPPROTO_SCTP, net); /* If IPv6 socket could not be created, try the IPv4 socket */ if (err < 0 && family == PF_INET6) err = inet_ctl_sock_create(&net->sctp.ctl_sock, AF_INET, SOCK_SEQPACKET, IPPROTO_SCTP, net); if (err < 0) { pr_err("Failed to create the SCTP control socket\n"); return err; } return 0; } /* Register address family specific functions. */ int sctp_register_af(struct sctp_af *af) { switch (af->sa_family) { case AF_INET: if (sctp_af_v4_specific) return 0; sctp_af_v4_specific = af; break; case AF_INET6: if (sctp_af_v6_specific) return 0; sctp_af_v6_specific = af; break; default: return 0; } INIT_LIST_HEAD(&af->list); list_add_tail(&af->list, &sctp_address_families); return 1; } /* Get the table of functions for manipulating a particular address * family. */ struct sctp_af *sctp_get_af_specific(sa_family_t family) { switch (family) { case AF_INET: return sctp_af_v4_specific; case AF_INET6: return sctp_af_v6_specific; default: return NULL; } } /* Common code to initialize a AF_INET msg_name. */ static void sctp_inet_msgname(char *msgname, int *addr_len) { struct sockaddr_in *sin; sin = (struct sockaddr_in *)msgname; *addr_len = sizeof(struct sockaddr_in); sin->sin_family = AF_INET; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); } /* Copy the primary address of the peer primary address as the msg_name. */ static void sctp_inet_event_msgname(struct sctp_ulpevent *event, char *msgname, int *addr_len) { struct sockaddr_in *sin, *sinfrom; if (msgname) { struct sctp_association *asoc; asoc = event->asoc; sctp_inet_msgname(msgname, addr_len); sin = (struct sockaddr_in *)msgname; sinfrom = &asoc->peer.primary_addr.v4; sin->sin_port = htons(asoc->peer.port); sin->sin_addr.s_addr = sinfrom->sin_addr.s_addr; } } /* Initialize and copy out a msgname from an inbound skb. */ static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len) { if (msgname) { struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in *sin = (struct sockaddr_in *)msgname; sctp_inet_msgname(msgname, len); sin->sin_port = sh->source; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; } } /* Do we support this AF? */ static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp) { /* PF_INET only supports AF_INET addresses. */ return AF_INET == family; } /* Address matching with wildcards allowed. */ static int sctp_inet_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2, struct sctp_sock *opt) { /* PF_INET only supports AF_INET addresses. */ if (addr1->sa.sa_family != addr2->sa.sa_family) return 0; if (htonl(INADDR_ANY) == addr1->v4.sin_addr.s_addr || htonl(INADDR_ANY) == addr2->v4.sin_addr.s_addr) return 1; if (addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr) return 1; return 0; } /* Verify that provided sockaddr looks bindable. Common verification has * already been taken care of. */ static int sctp_inet_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) { return sctp_v4_available(addr, opt); } /* Verify that sockaddr looks sendable. Common verification has already * been taken care of. */ static int sctp_inet_send_verify(struct sctp_sock *opt, union sctp_addr *addr) { return 1; } /* Fill in Supported Address Type information for INIT and INIT-ACK * chunks. Returns number of addresses supported. */ static int sctp_inet_supported_addrs(const struct sctp_sock *opt, __be16 *types) { types[0] = SCTP_PARAM_IPV4_ADDRESS; return 1; } /* Wrapper routine that calls the ip transmit routine. */ static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *transport) { struct inet_sock *inet = inet_sk(skb->sk); pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb, skb->len, &transport->fl.u.ip4.saddr, &transport->fl.u.ip4.daddr); inet->pmtudisc = transport->param_flags & SPP_PMTUD_ENABLE ? IP_PMTUDISC_DO : IP_PMTUDISC_DONT; SCTP_INC_STATS(sock_net(&inet->sk), SCTP_MIB_OUTSCTPPACKS); return ip_queue_xmit(&inet->sk, skb, &transport->fl); } static struct sctp_af sctp_af_inet; static struct sctp_pf sctp_pf_inet = { .event_msgname = sctp_inet_event_msgname, .skb_msgname = sctp_inet_skb_msgname, .af_supported = sctp_inet_af_supported, .cmp_addr = sctp_inet_cmp_addr, .bind_verify = sctp_inet_bind_verify, .send_verify = sctp_inet_send_verify, .supported_addrs = sctp_inet_supported_addrs, .create_accept_sk = sctp_v4_create_accept_sk, .addr_to_user = sctp_v4_addr_to_user, .to_sk_saddr = sctp_v4_to_sk_saddr, .to_sk_daddr = sctp_v4_to_sk_daddr, .af = &sctp_af_inet }; /* Notifier for inetaddr addition/deletion events. */ static struct notifier_block sctp_inetaddr_notifier = { .notifier_call = sctp_inetaddr_event, }; /* Socket operations. */ static const struct proto_ops inet_seqpacket_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, /* Needs to be wrapped... */ .bind = inet_bind, .connect = sctp_inet_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, /* Semantics are different. */ .poll = sctp_poll, .ioctl = inet_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, /* Looks harmless. */ .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem */ .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif }; /* Registration with AF_INET family. */ static struct inet_protosw sctp_seqpacket_protosw = { .type = SOCK_SEQPACKET, .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctp_stream_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SCTP, .prot = &sctp_prot, .ops = &inet_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; /* Register with IP layer. */ static const struct net_protocol sctp_protocol = { .handler = sctp_rcv, .err_handler = sctp_v4_err, .no_policy = 1, .netns_ok = 1, .icmp_strict_tag_validation = 1, }; /* IPv4 address related functions. */ static struct sctp_af sctp_af_inet = { .sa_family = AF_INET, .sctp_xmit = sctp_v4_xmit, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .get_dst = sctp_v4_get_dst, .get_saddr = sctp_v4_get_saddr, .copy_addrlist = sctp_v4_copy_addrlist, .from_skb = sctp_v4_from_skb, .from_sk = sctp_v4_from_sk, .from_addr_param = sctp_v4_from_addr_param, .to_addr_param = sctp_v4_to_addr_param, .cmp_addr = sctp_v4_cmp_addr, .addr_valid = sctp_v4_addr_valid, .inaddr_any = sctp_v4_inaddr_any, .is_any = sctp_v4_is_any, .available = sctp_v4_available, .scope = sctp_v4_scope, .skb_iif = sctp_v4_skb_iif, .is_ce = sctp_v4_is_ce, .seq_dump_addr = sctp_v4_seq_dump_addr, .ecn_capable = sctp_v4_ecn_capable, .net_header_len = sizeof(struct iphdr), .sockaddr_len = sizeof(struct sockaddr_in), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ip_setsockopt, .compat_getsockopt = compat_ip_getsockopt, #endif }; struct sctp_pf *sctp_get_pf_specific(sa_family_t family) { switch (family) { case PF_INET: return sctp_pf_inet_specific; case PF_INET6: return sctp_pf_inet6_specific; default: return NULL; } } /* Register the PF specific function table. */ int sctp_register_pf(struct sctp_pf *pf, sa_family_t family) { switch (family) { case PF_INET: if (sctp_pf_inet_specific) return 0; sctp_pf_inet_specific = pf; break; case PF_INET6: if (sctp_pf_inet6_specific) return 0; sctp_pf_inet6_specific = pf; break; default: return 0; } return 1; } static inline int init_sctp_mibs(struct net *net) { net->sctp.sctp_statistics = alloc_percpu(struct sctp_mib); if (!net->sctp.sctp_statistics) return -ENOMEM; return 0; } static inline void cleanup_sctp_mibs(struct net *net) { free_percpu(net->sctp.sctp_statistics); } static void sctp_v4_pf_init(void) { /* Initialize the SCTP specific PF functions. */ sctp_register_pf(&sctp_pf_inet, PF_INET); sctp_register_af(&sctp_af_inet); } static void sctp_v4_pf_exit(void) { list_del(&sctp_af_inet.list); } static int sctp_v4_protosw_init(void) { int rc; rc = proto_register(&sctp_prot, 1); if (rc) return rc; /* Register SCTP(UDP and TCP style) with socket layer. */ inet_register_protosw(&sctp_seqpacket_protosw); inet_register_protosw(&sctp_stream_protosw); return 0; } static void sctp_v4_protosw_exit(void) { inet_unregister_protosw(&sctp_stream_protosw); inet_unregister_protosw(&sctp_seqpacket_protosw); proto_unregister(&sctp_prot); } static int sctp_v4_add_protocol(void) { /* Register notifier for inet address additions/deletions. */ register_inetaddr_notifier(&sctp_inetaddr_notifier); /* Register SCTP with inet layer. */ if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) return -EAGAIN; return 0; } static void sctp_v4_del_protocol(void) { inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); unregister_inetaddr_notifier(&sctp_inetaddr_notifier); } static int __net_init sctp_defaults_init(struct net *net) { int status; /* * 14. Suggested SCTP Protocol Parameter Values */ /* The following protocol parameters are RECOMMENDED: */ /* RTO.Initial - 3 seconds */ net->sctp.rto_initial = SCTP_RTO_INITIAL; /* RTO.Min - 1 second */ net->sctp.rto_min = SCTP_RTO_MIN; /* RTO.Max - 60 seconds */ net->sctp.rto_max = SCTP_RTO_MAX; /* RTO.Alpha - 1/8 */ net->sctp.rto_alpha = SCTP_RTO_ALPHA; /* RTO.Beta - 1/4 */ net->sctp.rto_beta = SCTP_RTO_BETA; /* Valid.Cookie.Life - 60 seconds */ net->sctp.valid_cookie_life = SCTP_DEFAULT_COOKIE_LIFE; /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; /* Default sctp sockets to use md5 as their hmac alg */ #if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) net->sctp.sctp_hmac_alg = "md5"; #elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) net->sctp.sctp_hmac_alg = "sha1"; #else net->sctp.sctp_hmac_alg = NULL; #endif /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; /* Enable pf state by default */ net->sctp.pf_enable = 1; /* Association.Max.Retrans - 10 attempts * Path.Max.Retrans - 5 attempts (per destination address) * Max.Init.Retransmits - 8 attempts */ net->sctp.max_retrans_association = 10; net->sctp.max_retrans_path = 5; net->sctp.max_retrans_init = 8; /* Sendbuffer growth - do per-socket accounting */ net->sctp.sndbuf_policy = 0; /* Rcvbuffer growth - do per-socket accounting */ net->sctp.rcvbuf_policy = 0; /* HB.interval - 30 seconds */ net->sctp.hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; /* delayed SACK timeout */ net->sctp.sack_timeout = SCTP_DEFAULT_TIMEOUT_SACK; /* Disable ADDIP by default. */ net->sctp.addip_enable = 0; net->sctp.addip_noauth = 0; net->sctp.default_auto_asconf = 0; /* Enable PR-SCTP by default. */ net->sctp.prsctp_enable = 1; /* Disable RECONF by default. */ net->sctp.reconf_enable = 0; /* Disable AUTH by default. */ net->sctp.auth_enable = 0; /* Set SCOPE policy to enabled */ net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE; /* Set the default rwnd update threshold */ net->sctp.rwnd_upd_shift = SCTP_DEFAULT_RWND_SHIFT; /* Initialize maximum autoclose timeout. */ net->sctp.max_autoclose = INT_MAX / HZ; status = sctp_sysctl_net_register(net); if (status) goto err_sysctl_register; /* Allocate and initialise sctp mibs. */ status = init_sctp_mibs(net); if (status) goto err_init_mibs; /* Initialize proc fs directory. */ status = sctp_proc_init(net); if (status) goto err_init_proc; sctp_dbg_objcnt_init(net); /* Initialize the local address list. */ INIT_LIST_HEAD(&net->sctp.local_addr_list); spin_lock_init(&net->sctp.local_addr_lock); sctp_get_local_addr_list(net); /* Initialize the address event list */ INIT_LIST_HEAD(&net->sctp.addr_waitq); INIT_LIST_HEAD(&net->sctp.auto_asconf_splist); spin_lock_init(&net->sctp.addr_wq_lock); net->sctp.addr_wq_timer.expires = 0; setup_timer(&net->sctp.addr_wq_timer, sctp_addr_wq_timeout_handler, (unsigned long)net); return 0; err_init_proc: cleanup_sctp_mibs(net); err_init_mibs: sctp_sysctl_net_unregister(net); err_sysctl_register: return status; } static void __net_exit sctp_defaults_exit(struct net *net) { /* Free the local address list */ sctp_free_addr_wq(net); sctp_free_local_addr_list(net); sctp_dbg_objcnt_exit(net); sctp_proc_exit(net); cleanup_sctp_mibs(net); sctp_sysctl_net_unregister(net); } static struct pernet_operations sctp_defaults_ops = { .init = sctp_defaults_init, .exit = sctp_defaults_exit, }; static int __net_init sctp_ctrlsock_init(struct net *net) { int status; /* Initialize the control inode/socket for handling OOTB packets. */ status = sctp_ctl_sock_init(net); if (status) pr_err("Failed to initialize the SCTP control sock\n"); return status; } static void __net_exit sctp_ctrlsock_exit(struct net *net) { /* Free the control endpoint. */ inet_ctl_sock_destroy(net->sctp.ctl_sock); } static struct pernet_operations sctp_ctrlsock_ops = { .init = sctp_ctrlsock_init, .exit = sctp_ctrlsock_exit, }; /* Initialize the universe into something sensible. */ static __init int sctp_init(void) { int i; int status = -EINVAL; unsigned long goal; unsigned long limit; int max_share; int order; int num_entries; int max_entry_order; sock_skb_cb_check_size(sizeof(struct sctp_ulpevent)); /* Allocate bind_bucket and chunk caches. */ status = -ENOBUFS; sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket", sizeof(struct sctp_bind_bucket), 0, SLAB_HWCACHE_ALIGN, NULL); if (!sctp_bucket_cachep) goto out; sctp_chunk_cachep = kmem_cache_create("sctp_chunk", sizeof(struct sctp_chunk), 0, SLAB_HWCACHE_ALIGN, NULL); if (!sctp_chunk_cachep) goto err_chunk_cachep; status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); if (status) goto err_percpu_counter_init; /* Implementation specific variables. */ /* Initialize default stream count setup information. */ sctp_max_instreams = SCTP_DEFAULT_INSTREAMS; sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS; /* Initialize handle used for association ids. */ idr_init(&sctp_assocs_id); limit = nr_free_buffer_pages() / 8; limit = max(limit, 128UL); sysctl_sctp_mem[0] = limit / 4 * 3; sysctl_sctp_mem[1] = limit; sysctl_sctp_mem[2] = sysctl_sctp_mem[0] * 2; /* Set per-socket limits to no more than 1/128 the pressure threshold*/ limit = (sysctl_sctp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); sysctl_sctp_rmem[0] = SK_MEM_QUANTUM; /* give each asoc 1 page min */ sysctl_sctp_rmem[1] = 1500 * SKB_TRUESIZE(1); sysctl_sctp_rmem[2] = max(sysctl_sctp_rmem[1], max_share); sysctl_sctp_wmem[0] = SK_MEM_QUANTUM; sysctl_sctp_wmem[1] = 16*1024; sysctl_sctp_wmem[2] = max(64*1024, max_share); /* Size and allocate the association hash table. * The methodology is similar to that of the tcp hash tables. * Though not identical. Start by getting a goal size */ if (totalram_pages >= (128 * 1024)) goal = totalram_pages >> (22 - PAGE_SHIFT); else goal = totalram_pages >> (24 - PAGE_SHIFT); /* Then compute the page order for said goal */ order = get_order(goal); /* Now compute the required page order for the maximum sized table we * want to create */ max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES * sizeof(struct sctp_bind_hashbucket)); /* Limit the page order by that maximum hash table size */ order = min(order, max_entry_order); /* Allocate and initialize the endpoint hash table. */ sctp_ep_hashsize = 64; sctp_ep_hashtable = kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL); if (!sctp_ep_hashtable) { pr_err("Failed endpoint_hash alloc\n"); status = -ENOMEM; goto err_ehash_alloc; } for (i = 0; i < sctp_ep_hashsize; i++) { rwlock_init(&sctp_ep_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain); } /* Allocate and initialize the SCTP port hash table. * Note that order is initalized to start at the max sized * table we want to support. If we can't get that many pages * reduce the order and try again */ do { sctp_port_hashtable = (struct sctp_bind_hashbucket *) __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order); } while (!sctp_port_hashtable && --order > 0); if (!sctp_port_hashtable) { pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } /* Now compute the number of entries that will fit in the * port hash space we allocated */ num_entries = (1UL << order) * PAGE_SIZE / sizeof(struct sctp_bind_hashbucket); /* And finish by rounding it down to the nearest power of two * this wastes some memory of course, but its needed because * the hash function operates based on the assumption that * that the number of entries is a power of two */ sctp_port_hashsize = rounddown_pow_of_two(num_entries); for (i = 0; i < sctp_port_hashsize; i++) { spin_lock_init(&sctp_port_hashtable[i].lock); INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); } status = sctp_transport_hashtable_init(); if (status) goto err_thash_alloc; pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize, num_entries); sctp_sysctl_register(); INIT_LIST_HEAD(&sctp_address_families); sctp_v4_pf_init(); sctp_v6_pf_init(); status = register_pernet_subsys(&sctp_defaults_ops); if (status) goto err_register_defaults; status = sctp_v4_protosw_init(); if (status) goto err_protosw_init; status = sctp_v6_protosw_init(); if (status) goto err_v6_protosw_init; status = register_pernet_subsys(&sctp_ctrlsock_ops); if (status) goto err_register_ctrlsock; status = sctp_v4_add_protocol(); if (status) goto err_add_protocol; /* Register SCTP with inet6 layer. */ status = sctp_v6_add_protocol(); if (status) goto err_v6_add_protocol; if (sctp_offload_init() < 0) pr_crit("%s: Cannot add SCTP protocol offload\n", __func__); out: return status; err_v6_add_protocol: sctp_v4_del_protocol(); err_add_protocol: unregister_pernet_subsys(&sctp_ctrlsock_ops); err_register_ctrlsock: sctp_v6_protosw_exit(); err_v6_protosw_init: sctp_v4_protosw_exit(); err_protosw_init: unregister_pernet_subsys(&sctp_defaults_ops); err_register_defaults: sctp_v4_pf_exit(); sctp_v6_pf_exit(); sctp_sysctl_unregister(); free_pages((unsigned long)sctp_port_hashtable, get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); err_bhash_alloc: sctp_transport_hashtable_destroy(); err_thash_alloc: kfree(sctp_ep_hashtable); err_ehash_alloc: percpu_counter_destroy(&sctp_sockets_allocated); err_percpu_counter_init: kmem_cache_destroy(sctp_chunk_cachep); err_chunk_cachep: kmem_cache_destroy(sctp_bucket_cachep); goto out; } /* Exit handler for the SCTP protocol. */ static __exit void sctp_exit(void) { /* BUG. This should probably do something useful like clean * up all the remaining associations and all that memory. */ /* Unregister with inet6/inet layers. */ sctp_v6_del_protocol(); sctp_v4_del_protocol(); unregister_pernet_subsys(&sctp_ctrlsock_ops); /* Free protosw registrations */ sctp_v6_protosw_exit(); sctp_v4_protosw_exit(); unregister_pernet_subsys(&sctp_defaults_ops); /* Unregister with socket layer. */ sctp_v6_pf_exit(); sctp_v4_pf_exit(); sctp_sysctl_unregister(); free_pages((unsigned long)sctp_port_hashtable, get_order(sctp_port_hashsize * sizeof(struct sctp_bind_hashbucket))); kfree(sctp_ep_hashtable); sctp_transport_hashtable_destroy(); percpu_counter_destroy(&sctp_sockets_allocated); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(sctp_chunk_cachep); kmem_cache_destroy(sctp_bucket_cachep); } module_init(sctp_init); module_exit(sctp_exit); /* * __stringify doesn't likes enums, so use IPPROTO_SCTP value (132) directly. */ MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-132"); MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-132"); MODULE_AUTHOR("Linux Kernel SCTP developers <linux-sctp@vger.kernel.org>"); MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); module_param_named(no_checksums, sctp_checksum_disable, bool, 0644); MODULE_PARM_DESC(no_checksums, "Disable checksums computing and verification"); MODULE_LICENSE("GPL");
738 791 5986 5323 451 5358 708 3 450 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 * x86-64 work by Andi Kleen 2002 */ #ifndef _ASM_X86_FPU_INTERNAL_H #define _ASM_X86_FPU_INTERNAL_H #include <linux/compat.h> #include <linux/sched.h> #include <linux/slab.h> #include <asm/user.h> #include <asm/fpu/api.h> #include <asm/fpu/xstate.h> #include <asm/cpufeature.h> #include <asm/trace/fpu.h> /* * High level FPU state handling functions: */ extern void fpu__initialize(struct fpu *fpu); extern void fpu__prepare_read(struct fpu *fpu); extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); extern void fpu__clear(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); /* * Boot time FPU initialization functions: */ extern void fpu__init_cpu(void); extern void fpu__init_system_xstate(void); extern void fpu__init_cpu_xstate(void); extern void fpu__init_system(struct cpuinfo_x86 *c); extern void fpu__init_check_bugs(void); extern void fpu__resume_cpu(void); extern u64 fpu__get_supported_xfeatures_mask(void); /* * Debugging facility: */ #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else # define WARN_ON_FPU(x) ({ (void)(x); 0; }) #endif /* * FPU related CPU feature flag helper routines: */ static __always_inline __pure bool use_xsaveopt(void) { return static_cpu_has(X86_FEATURE_XSAVEOPT); } static __always_inline __pure bool use_xsave(void) { return static_cpu_has(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { return static_cpu_has(X86_FEATURE_FXSR); } /* * fpstate handling functions: */ extern union fpregs_state init_fpstate; extern void fpstate_init(union fpregs_state *state); #ifdef CONFIG_MATH_EMULATION extern void fpstate_init_soft(struct swregs_state *soft); #else static inline void fpstate_init_soft(struct swregs_state *soft) {} #endif static inline void fpstate_init_xstate(struct xregs_state *xsave) { /* * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask; } static inline void fpstate_init_fxstate(struct fxregs_state *fx) { fx->cwd = 0x37f; fx->mxcsr = MXCSR_DEFAULT; } extern void fpstate_sanitize_xstate(struct fpu *fpu); /* Returns 0 or the negated trap number, which results in -EFAULT for #PF */ #define user_insn(insn, output, input...) \ ({ \ int err; \ \ might_fault(); \ \ asm volatile(ASM_STAC "\n" \ "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ ".section .fixup,\"ax\"\n" \ "3: negl %%eax\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE_FAULT(1b, 3b) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_fprestore) \ : output : input) static inline int copy_fregs_to_user(struct fregs_state __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } static inline int copy_fxregs_to_user(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); /* See comment in copy_fxregs_to_kernel() below. */ return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx)); } static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) { kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) { kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } else { /* See comment in copy_fxregs_to_kernel() below. */ kernel_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); } } } static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); /* See comment in copy_fxregs_to_kernel() below. */ return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), "m" (*fx)); } static inline void copy_kernel_to_fregs(struct fregs_state *fx) { kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int copy_user_to_fregs(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void copy_fxregs_to_kernel(struct fpu *fpu) { if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave)); else if (IS_ENABLED(CONFIG_AS_FXSAVEQ)) asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave)); else { /* Using "rex64; fxsave %0" is broken because, if the memory * operand uses any extended registers for addressing, a second * REX prefix will be generated (to the assembler, rex64 * followed by semicolon is a separate instruction), and hence * the 64-bitness is lost. * * Using "fxsaveq %0" would be the ideal choice, but is only * supported starting with gas 2.16. * * Using, as a workaround, the properly prefixed form below * isn't accepted by any binutils version so far released, * complaining that the same type of prefix is used twice if * an extended register is needed for addressing (fix submitted * to mainline 2005-11-21). * * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave)); * * This, however, we can work around by forcing the compiler to * select an addressing mode that doesn't require extended * registers. */ asm volatile( "rex64/fxsave (%[fx])" : "=m" (fpu->state.fxsave) : [fx] "R" (&fpu->state.fxsave)); } } static inline void fxsave(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); else asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); } /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" #define XSAVES ".byte " REX_PREFIX "0x0f,0xc7,0x2f" #define XRSTOR ".byte " REX_PREFIX "0x0f,0xae,0x2f" #define XRSTORS ".byte " REX_PREFIX "0x0f,0xc7,0x1f" /* * After this @err contains 0 on success or the negated trap number when * the operation raises an exception. For faults this results in -EFAULT. */ #define XSTATE_OP(op, st, lmask, hmask, err) \ asm volatile("1:" op "\n\t" \ "xor %[err], %[err]\n" \ "2:\n\t" \ ".pushsection .fixup,\"ax\"\n\t" \ "3: negl %%eax\n\t" \ "jmp 2b\n\t" \ ".popsection\n\t" \ _ASM_EXTABLE_FAULT(1b, 3b) \ : [err] "=a" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * If XSAVES is enabled, it replaces XSAVEOPT because it supports a compact * format and supervisor states in addition to modified optimization in * XSAVEOPT. * * Otherwise, if XSAVEOPT is enabled, XSAVEOPT replaces XSAVE because XSAVEOPT * supports modified optimization which is not supported by XSAVE. * * We use XSAVE as a fallback. * * The 661 label is defined in the ALTERNATIVE* macros as the address of the * original instruction which gets replaced. We need to use it here as the * address of the instruction where we might get an exception at. */ #define XSTATE_XSAVE(st, lmask, hmask, err) \ asm volatile(ALTERNATIVE_2(XSAVE, \ XSAVEOPT, X86_FEATURE_XSAVEOPT, \ XSAVES, X86_FEATURE_XSAVES) \ "\n" \ "xor %[err], %[err]\n" \ "3:\n" \ ".pushsection .fixup,\"ax\"\n" \ "4: movl $-2, %[err]\n" \ "jmp 3b\n" \ ".popsection\n" \ _ASM_EXTABLE(661b, 4b) \ : [err] "=r" (err) \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * Use XRSTORS to restore context if it is enabled. XRSTORS supports compact * XSAVE area format. */ #define XSTATE_XRESTORE(st, lmask, hmask) \ asm volatile(ALTERNATIVE(XRSTOR, \ XRSTORS, X86_FEATURE_XSAVES) \ "\n" \ "3:\n" \ _ASM_EXTABLE_HANDLE(661b, 3b, ex_handler_fprestore)\ : \ : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory") /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. */ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) { u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON(system_state != SYSTEM_BOOTING); if (static_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); /* * We should never fault when copying from a kernel buffer, and the FPU * state we set at boot time should be valid. */ WARN_ON_FPU(err); } /* * Save processor xstate to xsave area. */ static inline void copy_xregs_to_kernel(struct xregs_state *xstate) { u64 mask = -1; u32 lmask = mask; u32 hmask = mask >> 32; int err; WARN_ON_FPU(!alternatives_patched); XSTATE_XSAVE(xstate, lmask, hmask, err); /* We should never fault when copying to a kernel buffer: */ WARN_ON_FPU(err); } /* * Restore processor xstate from xsave area. */ static inline void copy_kernel_to_xregs(struct xregs_state *xstate, u64 mask) { u32 lmask = mask; u32 hmask = mask >> 32; XSTATE_XRESTORE(xstate, lmask, hmask); } /* * Save xstate to user space xsave area. * * We don't use modified optimization because xrstor/xrstors might track * a different application. * * We don't use compacted format xsave area for * backward compatibility for old applications which don't understand * compacted format of xsave area. */ static inline int copy_xregs_to_user(struct xregs_state __user *buf) { int err; /* * Clear the xsave header first, so that reserved fields are * initialized to zero. */ err = __clear_user(&buf->header, sizeof(buf->header)); if (unlikely(err)) return -EFAULT; stac(); XSTATE_OP(XSAVE, buf, -1, -1, err); clac(); return err; } /* * Restore xstate from user space xsave area. */ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) { struct xregs_state *xstate = ((__force struct xregs_state *)buf); u32 lmask = mask; u32 hmask = mask >> 32; int err; stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); clac(); return err; } /* * These must be called with preempt disabled. Returns * 'true' if the FPU state is still intact and we can * keep registers active. * * The legacy FNSAVE instruction cleared all FPU state * unconditionally, so registers are essentially destroyed. * Modern FPU state can be kept in registers, if there are * no pending FP exceptions. */ static inline int copy_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { copy_xregs_to_kernel(&fpu->state.xsave); return 1; } if (likely(use_fxsr())) { copy_fxregs_to_kernel(fpu); return 1; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to mark them inactive: */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); return 0; } static inline void __copy_kernel_to_fpregs(union fpregs_state *fpstate, u64 mask) { if (use_xsave()) { copy_kernel_to_xregs(&fpstate->xsave, mask); } else { if (use_fxsr()) copy_kernel_to_fxregs(&fpstate->fxsave); else copy_kernel_to_fregs(&fpstate->fsave); } } static inline void copy_kernel_to_fpregs(union fpregs_state *fpstate) { /* * AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is * pending. Clear the x87 state here by setting it to fixed values. * "m" is a random variable that should be in L1. */ if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) { asm volatile( "fnclex\n\t" "emms\n\t" "fildl %P[addr]" /* set F?P to defined value */ : : [addr] "m" (fpstate)); } __copy_kernel_to_fpregs(fpstate, -1); } extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size); /* * FPU context switch related helper methods: */ DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* * The in-register FPU state for an FPU context on a CPU is assumed to be * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx * matches the FPU. * * If the FPU register state is valid, the kernel can skip restoring the * FPU state from memory. * * Any code that clobbers the FPU registers or updates the in-memory * FPU state for a task MUST let the rest of the kernel know that the * FPU registers are no longer valid for this task. * * Either one of these invalidation functions is enough. Invalidate * a resource you control: CPU if using the CPU for something else * (with preemption disabled), FPU for the current task, or a task that * is prevented from running by the current task. */ static inline void __cpu_invalidate_fpregs_state(void) { __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { fpu->last_cpu = -1; } static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } /* * These generally need preemption protection to work, * do try to avoid using these on their own: */ static inline void fpregs_deactivate(struct fpu *fpu) { this_cpu_write(fpu_fpregs_owner_ctx, NULL); trace_x86_fpu_regs_deactivated(fpu); } static inline void fpregs_activate(struct fpu *fpu) { this_cpu_write(fpu_fpregs_owner_ctx, fpu); trace_x86_fpu_regs_activated(fpu); } /* * FPU state switching for scheduling. * * This is a two-stage process: * * - switch_fpu_prepare() saves the old state. * This is done within the context of the old process. * * - switch_fpu_finish() restores the new state as * necessary. */ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else old_fpu->last_cpu = cpu; /* But leave fpu_fpregs_owner_ctx! */ trace_x86_fpu_regs_deactivated(old_fpu); } else old_fpu->last_cpu = -1; } /* * Misc helper functions: */ /* * Set up the userspace FPU context for the new task, if the task * has used the FPU. */ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { bool preload = static_cpu_has(X86_FEATURE_FPU) && new_fpu->initialized; if (preload) { if (!fpregs_state_valid(new_fpu, cpu)) copy_kernel_to_fpregs(&new_fpu->state); fpregs_activate(new_fpu); } } /* * Needs to be preemption-safe. * * NOTE! user_fpu_begin() must be used only immediately before restoring * the save state. It does not do any saving/restoring on its own. In * lazy FPU mode, it is just an optimization to avoid a #NM exception, * the task can lose the FPU right after preempt_enable(). */ static inline void user_fpu_begin(void) { struct fpu *fpu = &current->thread.fpu; preempt_disable(); fpregs_activate(fpu); preempt_enable(); } /* * MXCSR and XCR definitions: */ extern unsigned int mxcsr_feature_mask; #define XCR_XFEATURE_ENABLED_MASK 0x00000000 static inline u64 xgetbv(u32 index) { u32 eax, edx; asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ : "=a" (eax), "=d" (edx) : "c" (index)); return eax + ((u64)edx << 32); } static inline void xsetbv(u32 index, u64 value) { u32 eax = value; u32 edx = value >> 32; asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */ : : "a" (eax), "d" (edx), "c" (index)); } #endif /* _ASM_X86_FPU_INTERNAL_H */
3114 18 9528 74 3356 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VMSTAT_H #define _LINUX_VMSTAT_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/mmzone.h> #include <linux/vm_event_item.h> #include <linux/atomic.h> extern int sysctl_stat_interval; #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. * * Counters should only be incremented and no critical kernel component * should rely on the counter values. * * Counters are handled completely inline. On many platforms the code * generated will simply be the increment of a global address. */ struct vm_event_state { unsigned long event[NR_VM_EVENT_ITEMS]; }; DECLARE_PER_CPU(struct vm_event_state, vm_event_states); /* * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the * local_irq_disable overhead. */ static inline void __count_vm_event(enum vm_event_item item) { raw_cpu_inc(vm_event_states.event[item]); } static inline void count_vm_event(enum vm_event_item item) { this_cpu_inc(vm_event_states.event[item]); } static inline void __count_vm_events(enum vm_event_item item, long delta) { raw_cpu_add(vm_event_states.event[item], delta); } static inline void count_vm_events(enum vm_event_item item, long delta) { this_cpu_add(vm_event_states.event[item], delta); } extern void all_vm_events(unsigned long *); extern void vm_events_fold_cpu(int cpu); #else /* Disable counters */ static inline void count_vm_event(enum vm_event_item item) { } static inline void count_vm_events(enum vm_event_item item, long delta) { } static inline void __count_vm_event(enum vm_event_item item) { } static inline void __count_vm_events(enum vm_event_item item, long delta) { } static inline void all_vm_events(unsigned long *ret) { } static inline void vm_events_fold_cpu(int cpu) { } #endif /* CONFIG_VM_EVENT_COUNTERS */ #ifdef CONFIG_NUMA_BALANCING #define count_vm_numa_event(x) count_vm_event(x) #define count_vm_numa_events(x, y) count_vm_events(x, y) #else #define count_vm_numa_event(x) do {} while (0) #define count_vm_numa_events(x, y) do { (void)(y); } while (0) #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_DEBUG_TLBFLUSH #define count_vm_tlb_event(x) count_vm_event(x) #define count_vm_tlb_events(x, y) count_vm_events(x, y) #else #define count_vm_tlb_event(x) do {} while (0) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif #ifdef CONFIG_DEBUG_VM_VMACACHE #define count_vm_vmacache_event(x) count_vm_event(x) #else #define count_vm_vmacache_event(x) do {} while (0) #endif #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) /* * Zone and node-based page accounting with per cpu differentials. */ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; #ifdef CONFIG_NUMA static inline void zone_numa_state_add(long x, struct zone *zone, enum numa_stat_item item) { atomic_long_add(x, &zone->vm_numa_stat[item]); atomic_long_add(x, &vm_numa_stat[item]); } static inline unsigned long global_numa_state(enum numa_stat_item item) { long x = atomic_long_read(&vm_numa_stat[item]); return x; } static inline unsigned long zone_numa_state_snapshot(struct zone *zone, enum numa_stat_item item) { long x = atomic_long_read(&zone->vm_numa_stat[item]); int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; return x; } #endif /* CONFIG_NUMA */ static inline void zone_page_state_add(long x, struct zone *zone, enum zone_stat_item item) { atomic_long_add(x, &zone->vm_stat[item]); atomic_long_add(x, &vm_zone_stat[item]); } static inline void node_page_state_add(long x, struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_add(x, &pgdat->vm_stat[item]); atomic_long_add(x, &vm_node_stat[item]); } static inline unsigned long global_zone_page_state(enum zone_stat_item item) { long x = atomic_long_read(&vm_zone_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state(enum node_stat_item item) { long x = atomic_long_read(&vm_node_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long zone_page_state(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } /* * More accurate version that also considers the currently pending * deltas. For that we need to loop over all cpus to find the current * deltas. There is no synchronization so the result cannot be * exactly accurate either. */ static inline unsigned long zone_page_state_snapshot(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; #endif return x; } static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, enum node_stat_item item) { long x = atomic_long_read(&pgdat->vm_stat[item]); #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item]; if (x < 0) x = 0; #endif return x; } #ifdef CONFIG_NUMA extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #endif /* CONFIG_NUMA */ #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) #define add_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, __d) #define sub_node_page_state(__p, __i, __d) mod_node_page_state(__p, __i, -(__d)) #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); void __inc_zone_page_state(struct page *, enum zone_stat_item); void __dec_zone_page_state(struct page *, enum zone_stat_item); void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long); void __inc_node_page_state(struct page *, enum node_stat_item); void __dec_node_page_state(struct page *, enum node_stat_item); void mod_zone_page_state(struct zone *, enum zone_stat_item, long); void inc_zone_page_state(struct page *, enum zone_stat_item); void dec_zone_page_state(struct page *, enum zone_stat_item); void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); void inc_node_page_state(struct page *, enum node_stat_item); void dec_node_page_state(struct page *, enum node_stat_item); extern void inc_node_state(struct pglist_data *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_node_state(struct pglist_data *, enum node_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_node_state(struct pglist_data *, enum node_stat_item); void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; int vmstat_refresh(struct ctl_table *, int write, void __user *buffer, size_t *lenp, loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); void set_pgdat_percpu_threshold(pg_data_t *pgdat, int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* * We do not maintain differentials in a single processor configuration. * The functions directly modify the zone and global counters. */ static inline void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { zone_page_state_add(delta, zone, item); } static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { node_page_state_add(delta, pgdat, item); } static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_inc(&zone->vm_stat[item]); atomic_long_inc(&vm_zone_stat[item]); } static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_inc(&pgdat->vm_stat[item]); atomic_long_inc(&vm_node_stat[item]); } static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); atomic_long_dec(&vm_zone_stat[item]); } static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_dec(&pgdat->vm_stat[item]); atomic_long_dec(&vm_node_stat[item]); } static inline void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { __inc_zone_state(page_zone(page), item); } static inline void __inc_node_page_state(struct page *page, enum node_stat_item item) { __inc_node_state(page_pgdat(page), item); } static inline void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { __dec_zone_state(page_zone(page), item); } static inline void __dec_node_page_state(struct page *page, enum node_stat_item item) { __dec_node_state(page_pgdat(page), item); } /* * We only use atomic operations to update counters. So there is no need to * disable interrupts. */ #define inc_zone_page_state __inc_zone_page_state #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state #define inc_node_page_state __inc_node_page_state #define dec_node_page_state __dec_node_page_state #define mod_node_page_state __mod_node_page_state #define inc_zone_state __inc_zone_state #define inc_node_state __inc_node_state #define dec_zone_state __dec_zone_state #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } #endif /* CONFIG_SMP */ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, int migratetype) { __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); } extern const char * const vmstat_text[]; #endif /* _LINUX_VMSTAT_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 /* * Handle firewalling core * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> * Bart De Schuymer <bdschuym@pandora.be> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Lennert dedicates this file to Kerstin Wurdinger. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/in_route.h> #include <linux/inetdevice.h> #include <net/route.h> #include "br_private.h" #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { } static void fake_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { } static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; } static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { return NULL; } static unsigned int fake_mtu(const struct dst_entry *dst) { return dst->dev->mtu; } static struct dst_ops fake_dst_ops = { .family = AF_INET, .update_pmtu = fake_update_pmtu, .redirect = fake_redirect, .cow_metrics = fake_cow_metrics, .neigh_lookup = fake_neigh_lookup, .mtu = fake_mtu, }; /* * Initialize bogus route table used to keep netfilter happy. * Currently, we fill in the PMTU entry because netfilter * refragmentation needs it, and the rt_flags entry because * ipt_REJECT needs it. Future netfilter modules might * require us to fill additional fields. */ static const u32 br_dst_default_metrics[RTAX_MAX] = { [RTAX_MTU - 1] = 1500, }; void br_netfilter_rtable_init(struct net_bridge *br) { struct rtable *rt = &br->fake_rtable; atomic_set(&rt->dst.__refcnt, 1); rt->dst.dev = br->dev; rt->dst.path = &rt->dst; dst_init_metrics(&rt->dst, br_dst_default_metrics, true); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; } int __init br_nf_core_init(void) { return dst_entries_init(&fake_dst_ops); } void br_nf_core_fini(void) { dst_entries_destroy(&fake_dst_ops); }
43 1 1 6 4 4 4 41 46 46 11 6 4 1 6 7 7 7 7 3 7 5 7 4 7 4 7 1 4 7 5 4 7 4 7 7 6 18 4 3 1 14 12 1 16 9 9 15 7 6 13 11 2 1 2 9 9 9 1 11 11 10 1 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 // SPDX-License-Identifier: GPL-2.0 /* * NTP state machine interfaces and logic. * * This code was mainly moved from kernel/timer.c and kernel/time.c * Please see those files for relevant copyright info and historical * changelogs. */ #include <linux/capability.h> #include <linux/clocksource.h> #include <linux/workqueue.h> #include <linux/hrtimer.h> #include <linux/jiffies.h> #include <linux/math64.h> #include <linux/timex.h> #include <linux/time.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/rtc.h> #include <linux/math64.h> #include "ntp_internal.h" #include "timekeeping_internal.h" /* * NTP timekeeping variables: * * Note: All of the NTP state is protected by the timekeeping locks. */ /* USER_HZ period (usecs): */ unsigned long tick_usec = TICK_USEC; /* SHIFTED_HZ period (nsecs): */ unsigned long tick_nsec; static u64 tick_length; static u64 tick_length_base; #define SECS_PER_DAY 86400 #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) #define MAX_TAI_OFFSET 100000 /* * phase-lock loop variables */ /* * clock synchronization status * * (TIME_ERROR prevents overwriting the CMOS clock) */ static int time_state = TIME_OK; /* clock status bits: */ static int time_status = STA_UNSYNC; /* time adjustment (nsecs): */ static s64 time_offset; /* pll time constant: */ static long time_constant = 2; /* maximum error (usecs): */ static long time_maxerror = NTP_PHASE_LIMIT; /* estimated error (usecs): */ static long time_esterror = NTP_PHASE_LIMIT; /* frequency offset (scaled nsecs/secs): */ static s64 time_freq; /* time at last adjustment (secs): */ static time64_t time_reftime; static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; /* second value of the next pending leapsecond, or TIME64_MAX if no leap */ static time64_t ntp_next_leap_sec = TIME64_MAX; #ifdef CONFIG_NTP_PPS /* * The following variables are used when a pulse-per-second (PPS) signal * is available. They establish the engineering parameters of the clock * discipline loop when controlled by the PPS signal. */ #define PPS_VALID 10 /* PPS signal watchdog max (s) */ #define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ #define PPS_INTMIN 2 /* min freq interval (s) (shift) */ #define PPS_INTMAX 8 /* max freq interval (s) (shift) */ #define PPS_INTCOUNT 4 /* number of consecutive good intervals to increase pps_shift or consecutive bad intervals to decrease it */ #define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ static int pps_valid; /* signal watchdog counter */ static long pps_tf[3]; /* phase median filter */ static long pps_jitter; /* current jitter (ns) */ static struct timespec64 pps_fbase; /* beginning of the last freq interval */ static int pps_shift; /* current interval duration (s) (shift) */ static int pps_intcnt; /* interval counter */ static s64 pps_freq; /* frequency offset (scaled ns/s) */ static long pps_stabil; /* current stability (scaled ns/s) */ /* * PPS signal quality monitors */ static long pps_calcnt; /* calibration intervals */ static long pps_jitcnt; /* jitter limit exceeded */ static long pps_stbcnt; /* stability limit exceeded */ static long pps_errcnt; /* calibration errors */ /* PPS kernel consumer compensates the whole phase error immediately. * Otherwise, reduce the offset by a fixed factor times the time constant. */ static inline s64 ntp_offset_chunk(s64 offset) { if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) return offset; else return shift_right(offset, SHIFT_PLL + time_constant); } static inline void pps_reset_freq_interval(void) { /* the PPS calibration interval may end surprisingly early */ pps_shift = PPS_INTMIN; pps_intcnt = 0; } /** * pps_clear - Clears the PPS state variables */ static inline void pps_clear(void) { pps_reset_freq_interval(); pps_tf[0] = 0; pps_tf[1] = 0; pps_tf[2] = 0; pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; pps_freq = 0; } /* Decrease pps_valid to indicate that another second has passed since * the last PPS signal. When it reaches 0, indicate that PPS signal is * missing. */ static inline void pps_dec_valid(void) { if (pps_valid > 0) pps_valid--; else { time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); pps_clear(); } } static inline void pps_set_freq(s64 freq) { pps_freq = freq; } static inline int is_error_status(int status) { return (status & (STA_UNSYNC|STA_CLOCKERR)) /* PPS signal lost when either PPS time or * PPS frequency synchronization requested */ || ((status & (STA_PPSFREQ|STA_PPSTIME)) && !(status & STA_PPSSIGNAL)) /* PPS jitter exceeded when * PPS time synchronization requested */ || ((status & (STA_PPSTIME|STA_PPSJITTER)) == (STA_PPSTIME|STA_PPSJITTER)) /* PPS wander exceeded or calibration error when * PPS frequency synchronization requested */ || ((status & STA_PPSFREQ) && (status & (STA_PPSWANDER|STA_PPSERROR))); } static inline void pps_fill_timex(struct timex *txc) { txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->jitter = pps_jitter; if (!(time_status & STA_NANO)) txc->jitter /= NSEC_PER_USEC; txc->shift = pps_shift; txc->stabil = pps_stabil; txc->jitcnt = pps_jitcnt; txc->calcnt = pps_calcnt; txc->errcnt = pps_errcnt; txc->stbcnt = pps_stbcnt; } #else /* !CONFIG_NTP_PPS */ static inline s64 ntp_offset_chunk(s64 offset) { return shift_right(offset, SHIFT_PLL + time_constant); } static inline void pps_reset_freq_interval(void) {} static inline void pps_clear(void) {} static inline void pps_dec_valid(void) {} static inline void pps_set_freq(s64 freq) {} static inline int is_error_status(int status) { return status & (STA_UNSYNC|STA_CLOCKERR); } static inline void pps_fill_timex(struct timex *txc) { /* PPS is not implemented, so these are zero */ txc->ppsfreq = 0; txc->jitter = 0; txc->shift = 0; txc->stabil = 0; txc->jitcnt = 0; txc->calcnt = 0; txc->errcnt = 0; txc->stbcnt = 0; } #endif /* CONFIG_NTP_PPS */ /** * ntp_synced - Returns 1 if the NTP status is not UNSYNC * */ static inline int ntp_synced(void) { return !(time_status & STA_UNSYNC); } /* * NTP methods: */ /* * Update (tick_length, tick_length_base, tick_nsec), based * on (tick_usec, ntp_tick_adj, time_freq): */ static void ntp_update_frequency(void) { u64 second_length; u64 new_base; second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << NTP_SCALE_SHIFT; second_length += ntp_tick_adj; second_length += time_freq; tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; new_base = div_u64(second_length, NTP_INTERVAL_FREQ); /* * Don't wait for the next second_overflow, apply * the change to the tick length immediately: */ tick_length += new_base - tick_length_base; tick_length_base = new_base; } static inline s64 ntp_update_offset_fll(s64 offset64, long secs) { time_status &= ~STA_MODE; if (secs < MINSEC) return 0; if (!(time_status & STA_FLL) && (secs <= MAXSEC)) return 0; time_status |= STA_MODE; return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); } static void ntp_update_offset(long offset) { s64 freq_adj; s64 offset64; long secs; if (!(time_status & STA_PLL)) return; if (!(time_status & STA_NANO)) { /* Make sure the multiplication below won't overflow */ offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); offset *= NSEC_PER_USEC; } /* * Scale the phase adjustment and * clamp to the operating range. */ offset = clamp(offset, -MAXPHASE, MAXPHASE); /* * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ secs = (long)(__ktime_get_real_seconds() - time_reftime); if (unlikely(time_status & STA_FREQHOLD)) secs = 0; time_reftime = __ktime_get_real_seconds(); offset64 = offset; freq_adj = ntp_update_offset_fll(offset64, secs); /* * Clamp update interval to reduce PLL gain with low * sampling rate (e.g. intermittent network connection) * to avoid instability. */ if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) secs = 1 << (SHIFT_PLL + 1 + time_constant); freq_adj += (offset64 * secs) << (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); time_freq = max(freq_adj, -MAXFREQ_SCALED); time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); } /** * ntp_clear - Clears the NTP state variables */ void ntp_clear(void) { time_adjust = 0; /* stop active adjtime() */ time_status |= STA_UNSYNC; time_maxerror = NTP_PHASE_LIMIT; time_esterror = NTP_PHASE_LIMIT; ntp_update_frequency(); tick_length = tick_length_base; time_offset = 0; ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ pps_clear(); } u64 ntp_tick_length(void) { return tick_length; } /** * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t * * Provides the time of the next leapsecond against CLOCK_REALTIME in * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. */ ktime_t ntp_get_next_leap(void) { ktime_t ret; if ((time_state == TIME_INS) && (time_status & STA_INS)) return ktime_set(ntp_next_leap_sec, 0); ret = KTIME_MAX; return ret; } /* * this routine handles the overflow of the microsecond field * * The tricky bits of code to handle the accurate clock support * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame. * They were originally developed for SUN and DEC kernels. * All the kudos should go to Dave for this stuff. * * Also handles leap second processing, and returns leap offset */ int second_overflow(time64_t secs) { s64 delta; int leap = 0; s32 rem; /* * Leap second processing. If in leap-insert state at the end of the * day, the system clock is set back one second; if in leap-delete * state, the system clock is set ahead one second. */ switch (time_state) { case TIME_OK: if (time_status & STA_INS) { time_state = TIME_INS; div_s64_rem(secs, SECS_PER_DAY, &rem); ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } else if (time_status & STA_DEL) { time_state = TIME_DEL; div_s64_rem(secs + 1, SECS_PER_DAY, &rem); ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } break; case TIME_INS: if (!(time_status & STA_INS)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; } else if (secs == ntp_next_leap_sec) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } break; case TIME_DEL: if (!(time_status & STA_DEL)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; } else if (secs == ntp_next_leap_sec) { leap = 1; ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; break; case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; break; } /* Bump the maxerror field */ time_maxerror += MAXFREQ / NSEC_PER_USEC; if (time_maxerror > NTP_PHASE_LIMIT) { time_maxerror = NTP_PHASE_LIMIT; time_status |= STA_UNSYNC; } /* Compute the phase adjustment for the next second */ tick_length = tick_length_base; delta = ntp_offset_chunk(time_offset); time_offset -= delta; tick_length += delta; /* Check PPS signal */ pps_dec_valid(); if (!time_adjust) goto out; if (time_adjust > MAX_TICKADJ) { time_adjust -= MAX_TICKADJ; tick_length += MAX_TICKADJ_SCALED; goto out; } if (time_adjust < -MAX_TICKADJ) { time_adjust += MAX_TICKADJ; tick_length -= MAX_TICKADJ_SCALED; goto out; } tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT; time_adjust = 0; out: return leap; } #ifdef CONFIG_GENERIC_CMOS_UPDATE int __weak update_persistent_clock(struct timespec now) { return -ENODEV; } int __weak update_persistent_clock64(struct timespec64 now64) { struct timespec now; now = timespec64_to_timespec(now64); return update_persistent_clock(now); } #endif #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_cmos_clock(struct work_struct *work); static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock); static void sync_cmos_clock(struct work_struct *work) { struct timespec64 now; struct timespec64 next; int fail = 1; /* * If we have an externally synchronized Linux clock, then update * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be * called as close as possible to 500 ms before the new second starts. * This code is run on a timer. If the clock is set, that timer * may not expire at the correct time. Thus, we adjust... * We want the clock to be within a couple of ticks from the target. */ if (!ntp_synced()) { /* * Not synced, exit, do not restart a timer (if one is * running, let it run out). */ return; } getnstimeofday64(&now); if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) { struct timespec64 adjust = now; fail = -ENODEV; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE fail = update_persistent_clock64(adjust); #endif #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) fail = rtc_set_ntp_time(adjust); #endif } next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; if (!fail || fail == -ENODEV) next.tv_sec = 659; else next.tv_sec = 0; if (next.tv_nsec >= NSEC_PER_SEC) { next.tv_sec++; next.tv_nsec -= NSEC_PER_SEC; } queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, timespec64_to_jiffies(&next)); } void ntp_notify_cmos_timer(void) { queue_delayed_work(system_power_efficient_wq, &sync_cmos_work, 0); } #else void ntp_notify_cmos_timer(void) { } #endif /* * Propagate a new txc->status value into the NTP state: */ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) { if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; ntp_next_leap_sec = TIME64_MAX; /* restart PPS frequency calibration */ pps_reset_freq_interval(); } /* * If we turn on PLL adjustments then reset the * reference time to current time. */ if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) time_reftime = __ktime_get_real_seconds(); /* only set allowed bits */ time_status &= STA_RONLY; time_status |= txc->status & ~STA_RONLY; } static inline void process_adjtimex_modes(struct timex *txc, struct timespec64 *ts, s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc, ts); if (txc->modes & ADJ_NANO) time_status |= STA_NANO; if (txc->modes & ADJ_MICRO) time_status &= ~STA_NANO; if (txc->modes & ADJ_FREQUENCY) { time_freq = txc->freq * PPM_SCALE; time_freq = min(time_freq, MAXFREQ_SCALED); time_freq = max(time_freq, -MAXFREQ_SCALED); /* update pps_freq */ pps_set_freq(time_freq); } if (txc->modes & ADJ_MAXERROR) time_maxerror = txc->maxerror; if (txc->modes & ADJ_ESTERROR) time_esterror = txc->esterror; if (txc->modes & ADJ_TIMECONST) { time_constant = txc->constant; if (!(time_status & STA_NANO)) time_constant += 4; time_constant = min(time_constant, (long)MAXTC); time_constant = max(time_constant, 0l); } if (txc->modes & ADJ_TAI && txc->constant >= 0 && txc->constant <= MAX_TAI_OFFSET) *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); if (txc->modes & ADJ_TICK) tick_usec = txc->tick; if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET)) ntp_update_frequency(); } /** * ntp_validate_timex - Ensures the timex is ok for use in do_adjtimex */ int ntp_validate_timex(struct timex *txc) { if (txc->modes & ADJ_ADJTIME) { /* singleshot must not be used with any other mode bits */ if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) return -EINVAL; if (!(txc->modes & ADJ_OFFSET_READONLY) && !capable(CAP_SYS_TIME)) return -EPERM; } else { /* In order to modify anything, you gotta be super-user! */ if (txc->modes && !capable(CAP_SYS_TIME)) return -EPERM; /* * if the quartz is off by more than 10% then * something is VERY wrong! */ if (txc->modes & ADJ_TICK && (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)) return -EINVAL; } if (txc->modes & ADJ_SETOFFSET) { /* In order to inject time, you gotta be super-user! */ if (!capable(CAP_SYS_TIME)) return -EPERM; if (txc->modes & ADJ_NANO) { struct timespec ts; ts.tv_sec = txc->time.tv_sec; ts.tv_nsec = txc->time.tv_usec; if (!timespec_inject_offset_valid(&ts)) return -EINVAL; } else { if (!timeval_inject_offset_valid(&txc->time)) return -EINVAL; } } /* * Check for potential multiplication overflows that can * only happen on 64-bit systems: */ if ((txc->modes & ADJ_FREQUENCY) && (BITS_PER_LONG == 64)) { if (LLONG_MIN / PPM_SCALE > txc->freq) return -EINVAL; if (LLONG_MAX / PPM_SCALE < txc->freq) return -EINVAL; } return 0; } /* * adjtimex mainly allows reading (and writing, if superuser) of * kernel time-keeping variables. used by xntpd. */ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) { int result; if (txc->modes & ADJ_ADJTIME) { long save_adjust = time_adjust; if (!(txc->modes & ADJ_OFFSET_READONLY)) { /* adjtime() is independent from ntp_adjtime() */ time_adjust = txc->offset; ntp_update_frequency(); } txc->offset = save_adjust; } else { /* If there are input parameters, then process them: */ if (txc->modes) process_adjtimex_modes(txc, ts, time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); if (!(time_status & STA_NANO)) txc->offset /= NSEC_PER_USEC; } result = time_state; /* mostly `TIME_OK' */ /* check for errors */ if (is_error_status(time_status)) result = TIME_ERROR; txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * PPM_SCALE_INV, NTP_SCALE_SHIFT); txc->maxerror = time_maxerror; txc->esterror = time_esterror; txc->status = time_status; txc->constant = time_constant; txc->precision = 1; txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; txc->tick = tick_usec; txc->tai = *time_tai; /* fill PPS status fields */ pps_fill_timex(txc); txc->time.tv_sec = (time_t)ts->tv_sec; txc->time.tv_usec = ts->tv_nsec; if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; /* Handle leapsec adjustments */ if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { if ((time_state == TIME_INS) && (time_status & STA_INS)) { result = TIME_OOP; txc->tai++; txc->time.tv_sec--; } if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { result = TIME_WAIT; txc->tai--; txc->time.tv_sec++; } if ((time_state == TIME_OOP) && (ts->tv_sec == ntp_next_leap_sec)) { result = TIME_WAIT; } } return result; } #ifdef CONFIG_NTP_PPS /* actually struct pps_normtime is good old struct timespec, but it is * semantically different (and it is the reason why it was invented): * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ struct pps_normtime { s64 sec; /* seconds */ long nsec; /* nanoseconds */ }; /* normalize the timestamp so that nsec is in the ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ static inline struct pps_normtime pps_normalize_ts(struct timespec64 ts) { struct pps_normtime norm = { .sec = ts.tv_sec, .nsec = ts.tv_nsec }; if (norm.nsec > (NSEC_PER_SEC >> 1)) { norm.nsec -= NSEC_PER_SEC; norm.sec++; } return norm; } /* get current phase correction and jitter */ static inline long pps_phase_filter_get(long *jitter) { *jitter = pps_tf[0] - pps_tf[1]; if (*jitter < 0) *jitter = -*jitter; /* TODO: test various filters */ return pps_tf[0]; } /* add the sample to the phase filter */ static inline void pps_phase_filter_add(long err) { pps_tf[2] = pps_tf[1]; pps_tf[1] = pps_tf[0]; pps_tf[0] = err; } /* decrease frequency calibration interval length. * It is halved after four consecutive unstable intervals. */ static inline void pps_dec_freq_interval(void) { if (--pps_intcnt <= -PPS_INTCOUNT) { pps_intcnt = -PPS_INTCOUNT; if (pps_shift > PPS_INTMIN) { pps_shift--; pps_intcnt = 0; } } } /* increase frequency calibration interval length. * It is doubled after four consecutive stable intervals. */ static inline void pps_inc_freq_interval(void) { if (++pps_intcnt >= PPS_INTCOUNT) { pps_intcnt = PPS_INTCOUNT; if (pps_shift < PPS_INTMAX) { pps_shift++; pps_intcnt = 0; } } } /* update clock frequency based on MONOTONIC_RAW clock PPS signal * timestamps * * At the end of the calibration interval the difference between the * first and last MONOTONIC_RAW clock timestamps divided by the length * of the interval becomes the frequency update. If the interval was * too long, the data are discarded. * Returns the difference between old and new frequency values. */ static long hardpps_update_freq(struct pps_normtime freq_norm) { long delta, delta_mod; s64 ftemp; /* check if the frequency interval was too long */ if (freq_norm.sec > (2 << pps_shift)) { time_status |= STA_PPSERROR; pps_errcnt++; pps_dec_freq_interval(); printk_deferred(KERN_ERR "hardpps: PPSERROR: interval too long - %lld s\n", freq_norm.sec); return 0; } /* here the raw frequency offset and wander (stability) is * calculated. If the wander is less than the wander threshold * the interval is increased; otherwise it is decreased. */ ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT, freq_norm.sec); delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { printk_deferred(KERN_WARNING "hardpps: PPSWANDER: change=%ld\n", delta); time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); } else { /* good sample */ pps_inc_freq_interval(); } /* the stability metric is calculated as the average of recent * frequency changes, but is used only for performance * monitoring */ delta_mod = delta; if (delta_mod < 0) delta_mod = -delta_mod; pps_stabil += (div_s64(((s64)delta_mod) << (NTP_SCALE_SHIFT - SHIFT_USEC), NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN; /* if enabled, the system clock frequency is updated */ if ((time_status & STA_PPSFREQ) != 0 && (time_status & STA_FREQHOLD) == 0) { time_freq = pps_freq; ntp_update_frequency(); } return delta; } /* correct REALTIME clock phase error against PPS signal */ static void hardpps_update_phase(long error) { long correction = -error; long jitter; /* add the sample to the median filter */ pps_phase_filter_add(correction); correction = pps_phase_filter_get(&jitter); /* Nominal jitter is due to PPS signal noise. If it exceeds the * threshold, the sample is discarded; otherwise, if so enabled, * the time offset is updated. */ if (jitter > (pps_jitter << PPS_POPCORN)) { printk_deferred(KERN_WARNING "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", jitter, (pps_jitter << PPS_POPCORN)); time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { /* correct the time using the phase offset */ time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); /* cancel running adjtime() */ time_adjust = 0; } /* update jitter */ pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN; } /* * __hardpps() - discipline CPU clock oscillator to external PPS signal * * This routine is called at each PPS signal arrival in order to * discipline the CPU clock oscillator to the PPS signal. It takes two * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former * is used to correct clock phase error and the latter is used to * correct the frequency. * * This code is based on David Mills's reference nanokernel * implementation. It was mostly rewritten but keeps the same idea. */ void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) { struct pps_normtime pts_norm, freq_norm; pts_norm = pps_normalize_ts(*phase_ts); /* clear the error bits, they will be set again if needed */ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); /* indicate signal presence */ time_status |= STA_PPSSIGNAL; pps_valid = PPS_VALID; /* when called for the first time, * just start the frequency interval */ if (unlikely(pps_fbase.tv_sec == 0)) { pps_fbase = *raw_ts; return; } /* ok, now we have a base for frequency calculation */ freq_norm = pps_normalize_ts(timespec64_sub(*raw_ts, pps_fbase)); /* check that the signal is in the range * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */ if ((freq_norm.sec == 0) || (freq_norm.nsec > MAXFREQ * freq_norm.sec) || (freq_norm.nsec < -MAXFREQ * freq_norm.sec)) { time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } /* signal is ok */ /* check if the current frequency interval is finished */ if (freq_norm.sec >= (1 << pps_shift)) { pps_calcnt++; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; hardpps_update_freq(freq_norm); } hardpps_update_phase(pts_norm.nsec); } #endif /* CONFIG_NTP_PPS */ static int __init ntp_tick_adj_setup(char *str) { int rc = kstrtol(str, 0, (long *)&ntp_tick_adj); if (rc) return rc; ntp_tick_adj <<= NTP_SCALE_SHIFT; return 1; } __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); }
3 3 1 3 3 3 1 3 3 3 3 3 2 2 2 2 2 2 2 2 2 1 1 2 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 /* * linux/fs/ufs/super.c * * Copyright (C) 1998 * Daniel Pirkl <daniel.pirkl@email.cz> * Charles University, Faculty of Mathematics and Physics */ /* Derived from * * linux/fs/ext2/super.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ /* * Inspired by * * linux/fs/ufs/super.c * * Copyright (C) 1996 * Adrian Rodriguez (adrian@franklins-tower.rutgers.edu) * Laboratory for Computer Science Research Computing Facility * Rutgers, The State University of New Jersey * * Copyright (C) 1996 Eddie C. Dost (ecd@skynet.be) * * Kernel module support added on 96/04/26 by * Stefan Reinauer <stepan@home.culture.mipt.ru> * * Module usage counts added on 96/04/29 by * Gertjan van Wingerde <gwingerde@gmail.com> * * Clean swab support on 19970406 by * Francois-Rene Rideau <fare@tunes.org> * * 4.4BSD (FreeBSD) support added on February 1st 1998 by * Niels Kristian Bech Jensen <nkbj@image.dk> partially based * on code by Martin von Loewis <martin@mira.isdn.cs.tu-berlin.de>. * * NeXTstep support added on February 5th 1998 by * Niels Kristian Bech Jensen <nkbj@image.dk>. * * write support Daniel Pirkl <daniel.pirkl@email.cz> 1998 * * HP/UX hfs filesystem support added by * Martin K. Petersen <mkp@mkp.net>, August 1999 * * UFS2 (of FreeBSD 5.x) support added by * Niraj Kumar <niraj17@iitbombay.org>, Jan 2004 * * UFS2 write support added by * Evgeniy Dushistov <dushistov@mail.ru>, 2007 */ #include <linux/exportfs.h> #include <linux/module.h> #include <linux/bitops.h> #include <stdarg.h> #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/init.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/log2.h> #include <linux/mount.h> #include <linux/seq_file.h> #include "ufs_fs.h" #include "ufs.h" #include "swab.h" #include "util.h" static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct inode *inode; if (ino < UFS_ROOTINO || ino > (u64)uspi->s_ncg * uspi->s_ipg) return ERR_PTR(-ESTALE); inode = ufs_iget(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return inode; } static struct dentry *ufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); } static struct dentry *ufs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ufs_nfs_get_inode); } static struct dentry *ufs_get_parent(struct dentry *child) { struct qstr dot_dot = QSTR_INIT("..", 2); ino_t ino; ino = ufs_inode_by_name(d_inode(child), &dot_dot); if (!ino) return ERR_PTR(-ENOENT); return d_obtain_alias(ufs_iget(child->d_sb, ino)); } static const struct export_operations ufs_export_ops = { .fh_to_dentry = ufs_fh_to_dentry, .fh_to_parent = ufs_fh_to_parent, .get_parent = ufs_get_parent, }; #ifdef CONFIG_UFS_DEBUG /* * Print contents of ufs_super_block, useful for debugging */ static void ufs_print_super_stuff(struct super_block *sb, struct ufs_super_block_first *usb1, struct ufs_super_block_second *usb2, struct ufs_super_block_third *usb3) { u32 magic = fs32_to_cpu(sb, usb3->fs_magic); pr_debug("ufs_print_super_stuff\n"); pr_debug(" magic: 0x%x\n", magic); if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { pr_debug(" fs_size: %llu\n", (unsigned long long) fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); pr_debug(" fs_dsize: %llu\n", (unsigned long long) fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); pr_debug(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); pr_debug(" fs_sblockloc: %llu\n", (unsigned long long) fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); pr_debug(" cs_ndir(No of dirs): %llu\n", (unsigned long long) fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); pr_debug(" cs_nbfree(No of free blocks): %llu\n", (unsigned long long) fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); pr_info(" cs_nifree(Num of free inodes): %llu\n", (unsigned long long) fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); pr_info(" cs_nffree(Num of free frags): %llu\n", (unsigned long long) fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); pr_info(" fs_maxsymlinklen: %u\n", fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); } else { pr_debug(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); pr_debug(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); pr_debug(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); pr_debug(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); pr_debug(" cgoffset: %u\n", fs32_to_cpu(sb, usb1->fs_cgoffset)); pr_debug(" ~cgmask: 0x%x\n", ~fs32_to_cpu(sb, usb1->fs_cgmask)); pr_debug(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); pr_debug(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); pr_debug(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); pr_debug(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); pr_debug(" fragshift: %u\n", fs32_to_cpu(sb, usb1->fs_fragshift)); pr_debug(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); pr_debug(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); pr_debug(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); pr_debug(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); pr_debug(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); pr_debug(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); pr_debug(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); pr_debug(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); pr_debug(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); pr_debug(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); pr_debug(" fstodb: %u\n", fs32_to_cpu(sb, usb1->fs_fsbtodb)); pr_debug(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); pr_debug(" ndir %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); pr_debug(" nifree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); pr_debug(" nbfree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); pr_debug(" nffree %u\n", fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); } pr_debug("\n"); } /* * Print contents of ufs_cylinder_group, useful for debugging */ static void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg) { pr_debug("\nufs_print_cylinder_stuff\n"); pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); pr_debug(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); pr_debug(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); pr_debug(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); pr_debug(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); pr_debug(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); pr_debug(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); pr_debug(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); pr_debug(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); pr_debug(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); pr_debug(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); pr_debug(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); pr_debug(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); pr_debug(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); pr_debug(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); pr_debug(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); pr_debug(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); pr_debug(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); pr_debug(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); pr_debug(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); pr_debug(" clustersumoff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); pr_debug(" clusteroff %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); pr_debug(" nclusterblks %u\n", fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); pr_debug("\n"); } #else # define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ # define ufs_print_cylinder_stuff(sb, cg) /**/ #endif /* CONFIG_UFS_DEBUG */ static const struct super_operations ufs_super_ops; void ufs_error (struct super_block * sb, const char * function, const char * fmt, ...) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); if (!sb_rdonly(sb)) { usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); sb->s_flags |= MS_RDONLY; } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { case UFS_MOUNT_ONERROR_PANIC: panic("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); case UFS_MOUNT_ONERROR_LOCK: case UFS_MOUNT_ONERROR_UMOUNT: case UFS_MOUNT_ONERROR_REPAIR: pr_crit("error (device %s): %s: %pV\n", sb->s_id, function, &vaf); } va_end(args); } void ufs_panic (struct super_block * sb, const char * function, const char * fmt, ...) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); if (!sb_rdonly(sb)) { usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); } va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; sb->s_flags |= MS_RDONLY; pr_crit("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); } void ufs_warning (struct super_block * sb, const char * function, const char * fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_warn("(device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); } enum { Opt_type_old = UFS_MOUNT_UFSTYPE_OLD, Opt_type_sunx86 = UFS_MOUNT_UFSTYPE_SUNx86, Opt_type_sun = UFS_MOUNT_UFSTYPE_SUN, Opt_type_sunos = UFS_MOUNT_UFSTYPE_SUNOS, Opt_type_44bsd = UFS_MOUNT_UFSTYPE_44BSD, Opt_type_ufs2 = UFS_MOUNT_UFSTYPE_UFS2, Opt_type_hp = UFS_MOUNT_UFSTYPE_HP, Opt_type_nextstepcd = UFS_MOUNT_UFSTYPE_NEXTSTEP_CD, Opt_type_nextstep = UFS_MOUNT_UFSTYPE_NEXTSTEP, Opt_type_openstep = UFS_MOUNT_UFSTYPE_OPENSTEP, Opt_onerror_panic = UFS_MOUNT_ONERROR_PANIC, Opt_onerror_lock = UFS_MOUNT_ONERROR_LOCK, Opt_onerror_umount = UFS_MOUNT_ONERROR_UMOUNT, Opt_onerror_repair = UFS_MOUNT_ONERROR_REPAIR, Opt_err }; static const match_table_t tokens = { {Opt_type_old, "ufstype=old"}, {Opt_type_sunx86, "ufstype=sunx86"}, {Opt_type_sun, "ufstype=sun"}, {Opt_type_sunos, "ufstype=sunos"}, {Opt_type_44bsd, "ufstype=44bsd"}, {Opt_type_ufs2, "ufstype=ufs2"}, {Opt_type_ufs2, "ufstype=5xbsd"}, {Opt_type_hp, "ufstype=hp"}, {Opt_type_nextstepcd, "ufstype=nextstep-cd"}, {Opt_type_nextstep, "ufstype=nextstep"}, {Opt_type_openstep, "ufstype=openstep"}, /*end of possible ufs types */ {Opt_onerror_panic, "onerror=panic"}, {Opt_onerror_lock, "onerror=lock"}, {Opt_onerror_umount, "onerror=umount"}, {Opt_onerror_repair, "onerror=repair"}, {Opt_err, NULL} }; static int ufs_parse_options (char * options, unsigned * mount_options) { char * p; UFSD("ENTER\n"); if (!options) return 1; while ((p = strsep(&options, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_type_old: ufs_clear_opt (*mount_options, UFSTYPE); ufs_set_opt (*mount_options, UFSTYPE_OLD); break; case Opt_type_sunx86: ufs_clear_opt (*mount_options, UFSTYPE); ufs_set_opt (*mount_options, UFSTYPE_SUNx86); break; case Opt_type_sun: ufs_clear_opt (*mount_options, UFSTYPE); ufs_set_opt (*mount_options, UFSTYPE_SUN); break; case Opt_type_sunos: ufs_clear_opt(*mount_options, UFSTYPE); ufs_set_opt(*mount_options, UFSTYPE_SUNOS); break; case Opt_type_44bsd: ufs_clear_opt (*mount_options, UFSTYPE); ufs_set_opt (*mount_options, UFSTYPE_44BSD); break; case Opt_type_ufs2: ufs_clear_opt(*mount_options, UFSTYPE); ufs_set_opt(*mount_options, UFSTYPE_UFS2); break; case Opt_type_hp: ufs_clear_opt (*mount_options, UFSTYPE); ufs_set_opt (*mount_options, UFSTYPE_HP); break; case Opt_type_nextstepcd: ufs_clear_opt (*mount_options, UFSTYPE); ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP_CD); break; case Opt_type_nextstep: ufs_clear_opt (*mount_options, UFSTYPE); ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP); break; case Opt_type_openstep: ufs_clear_opt (*mount_options, UFSTYPE); ufs_set_opt (*mount_options, UFSTYPE_OPENSTEP); break; case Opt_onerror_panic: ufs_clear_opt (*mount_options, ONERROR); ufs_set_opt (*mount_options, ONERROR_PANIC); break; case Opt_onerror_lock: ufs_clear_opt (*mount_options, ONERROR); ufs_set_opt (*mount_options, ONERROR_LOCK); break; case Opt_onerror_umount: ufs_clear_opt (*mount_options, ONERROR); ufs_set_opt (*mount_options, ONERROR_UMOUNT); break; case Opt_onerror_repair: pr_err("Unable to do repair on error, will lock lock instead\n"); ufs_clear_opt (*mount_options, ONERROR); ufs_set_opt (*mount_options, ONERROR_REPAIR); break; default: pr_err("Invalid option: \"%s\" or missing value\n", p); return 0; } } return 1; } /* * Different types of UFS hold fs_cstotal in different * places, and use different data structure for it. * To make things simpler we just copy fs_cstotal to ufs_sb_private_info */ static void ufs_setup_cstotal(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; UFSD("ENTER, mtype=%u\n", mtype); usb1 = ubh_get_usb_first(uspi); usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) || mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir); uspi->cs_total.cs_nbfree = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree); uspi->cs_total.cs_nifree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree); uspi->cs_total.cs_nffree = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree); } else { uspi->cs_total.cs_ndir = fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir); uspi->cs_total.cs_nbfree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree); uspi->cs_total.cs_nifree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree); uspi->cs_total.cs_nffree = fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree); } UFSD("EXIT\n"); } /* * Read on-disk structures associated with cylinder groups */ static int ufs_read_cylinder_structures(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned size, blks, i; UFSD("ENTER\n"); /* * Read cs structures from (usually) first data block * on the device. */ size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; base = space = kmalloc(size, GFP_NOFS); if (!base) goto failed; sbi->s_csp = (struct ufs_csum *)space; for (i = 0; i < blks; i += uspi->s_fpb) { size = uspi->s_bsize; if (i + uspi->s_fpb > blks) size = (blks - i) * uspi->s_fsize; ubh = ubh_bread(sb, uspi->s_csaddr + i, size); if (!ubh) goto failed; ubh_ubhcpymem (space, ubh, size); space += size; ubh_brelse (ubh); ubh = NULL; } /* * Read cylinder group (we read only first fragment from block * at this time) and prepare internal data structures for cg caching. */ if (!(sbi->s_ucg = kmalloc (sizeof(struct buffer_head *) * uspi->s_ncg, GFP_NOFS))) goto failed; for (i = 0; i < uspi->s_ncg; i++) sbi->s_ucg[i] = NULL; for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { sbi->s_ucpi[i] = NULL; sbi->s_cgno[i] = UFS_CGNO_EMPTY; } for (i = 0; i < uspi->s_ncg; i++) { UFSD("read cg %u\n", i); if (!(sbi->s_ucg[i] = sb_bread(sb, ufs_cgcmin(i)))) goto failed; if (!ufs_cg_chkmagic (sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data)) goto failed; ufs_print_cylinder_stuff(sb, (struct ufs_cylinder_group *) sbi->s_ucg[i]->b_data); } for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) { if (!(sbi->s_ucpi[i] = kmalloc (sizeof(struct ufs_cg_private_info), GFP_NOFS))) goto failed; sbi->s_cgno[i] = UFS_CGNO_EMPTY; } sbi->s_cg_loaded = 0; UFSD("EXIT\n"); return 1; failed: kfree (base); if (sbi->s_ucg) { for (i = 0; i < uspi->s_ncg; i++) if (sbi->s_ucg[i]) brelse (sbi->s_ucg[i]); kfree (sbi->s_ucg); for (i = 0; i < UFS_MAX_GROUP_LOADED; i++) kfree (sbi->s_ucpi[i]); } UFSD("EXIT (FAILED)\n"); return 0; } /* * Sync our internal copy of fs_cstotal with disk */ static void ufs_put_cstotal(struct super_block *sb) { unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; UFSD("ENTER\n"); usb1 = ubh_get_usb_first(uspi); usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); if (mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ usb2->fs_un.fs_u2.cs_ndir = cpu_to_fs64(sb, uspi->cs_total.cs_ndir); usb2->fs_un.fs_u2.cs_nbfree = cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); usb3->fs_un1.fs_u2.cs_nifree = cpu_to_fs64(sb, uspi->cs_total.cs_nifree); usb3->fs_un1.fs_u2.cs_nffree = cpu_to_fs64(sb, uspi->cs_total.cs_nffree); goto out; } if (mtype == UFS_MOUNT_UFSTYPE_44BSD && (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) { /* store stats in both old and new places */ usb2->fs_un.fs_u2.cs_ndir = cpu_to_fs64(sb, uspi->cs_total.cs_ndir); usb2->fs_un.fs_u2.cs_nbfree = cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); usb3->fs_un1.fs_u2.cs_nifree = cpu_to_fs64(sb, uspi->cs_total.cs_nifree); usb3->fs_un1.fs_u2.cs_nffree = cpu_to_fs64(sb, uspi->cs_total.cs_nffree); } usb1->fs_cstotal.cs_ndir = cpu_to_fs32(sb, uspi->cs_total.cs_ndir); usb1->fs_cstotal.cs_nbfree = cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); usb1->fs_cstotal.cs_nifree = cpu_to_fs32(sb, uspi->cs_total.cs_nifree); usb1->fs_cstotal.cs_nffree = cpu_to_fs32(sb, uspi->cs_total.cs_nffree); out: ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_print_super_stuff(sb, usb1, usb2, usb3); UFSD("EXIT\n"); } /** * ufs_put_super_internal() - put on-disk intrenal structures * @sb: pointer to super_block structure * Put on-disk structures associated with cylinder groups * and write them back to disk, also update cs_total on disk */ static void ufs_put_super_internal(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); struct ufs_sb_private_info *uspi = sbi->s_uspi; struct ufs_buffer_head * ubh; unsigned char * base, * space; unsigned blks, size, i; UFSD("ENTER\n"); ufs_put_cstotal(sb); size = uspi->s_cssize; blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift; base = space = (char*) sbi->s_csp; for (i = 0; i < blks; i += uspi->s_fpb) { size = uspi->s_bsize; if (i + uspi->s_fpb > blks) size = (blks - i) * uspi->s_fsize; ubh = ubh_bread(sb, uspi->s_csaddr + i, size); ubh_memcpyubh (ubh, space, size); space += size; ubh_mark_buffer_uptodate (ubh, 1); ubh_mark_buffer_dirty (ubh); ubh_brelse (ubh); } for (i = 0; i < sbi->s_cg_loaded; i++) { ufs_put_cylinder (sb, i); kfree (sbi->s_ucpi[i]); } for (; i < UFS_MAX_GROUP_LOADED; i++) kfree (sbi->s_ucpi[i]); for (i = 0; i < uspi->s_ncg; i++) brelse (sbi->s_ucg[i]); kfree (sbi->s_ucg); kfree (base); UFSD("EXIT\n"); } static int ufs_sync_fs(struct super_block *sb, int wait) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_third * usb3; unsigned flags; mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); flags = UFS_SB(sb)->s_flags; uspi = UFS_SB(sb)->s_uspi; usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); usb1->fs_time = cpu_to_fs32(sb, get_seconds()); if ((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); ufs_put_cstotal(sb); UFSD("EXIT\n"); mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } static void delayed_sync_fs(struct work_struct *work) { struct ufs_sb_info *sbi; sbi = container_of(work, struct ufs_sb_info, sync_work.work); spin_lock(&sbi->work_lock); sbi->work_queued = 0; spin_unlock(&sbi->work_lock); ufs_sync_fs(sbi->sb, 1); } void ufs_mark_sb_dirty(struct super_block *sb) { struct ufs_sb_info *sbi = UFS_SB(sb); unsigned long delay; spin_lock(&sbi->work_lock); if (!sbi->work_queued) { delay = msecs_to_jiffies(dirty_writeback_interval * 10); queue_delayed_work(system_long_wq, &sbi->sync_work, delay); sbi->work_queued = 1; } spin_unlock(&sbi->work_lock); } static void ufs_put_super(struct super_block *sb) { struct ufs_sb_info * sbi = UFS_SB(sb); UFSD("ENTER\n"); if (!sb_rdonly(sb)) ufs_put_super_internal(sb); cancel_delayed_work_sync(&sbi->sync_work); ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); kfree (sbi); sb->s_fs_info = NULL; UFSD("EXIT\n"); return; } static u64 ufs_max_bytes(struct super_block *sb) { struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; int bits = uspi->s_apbshift; u64 res; if (bits > 21) res = ~0ULL; else res = UFS_NDADDR + (1LL << bits) + (1LL << (2*bits)) + (1LL << (3*bits)); if (res >= (MAX_LFS_FILESIZE >> uspi->s_bshift)) return MAX_LFS_FILESIZE; return res << uspi->s_bshift; } static int ufs_fill_super(struct super_block *sb, void *data, int silent) { struct ufs_sb_info * sbi; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_second * usb2; struct ufs_super_block_third * usb3; struct ufs_buffer_head * ubh; struct inode *inode; unsigned block_size, super_block_size; unsigned flags; unsigned super_block_offset; unsigned maxsymlen; int ret = -EINVAL; uspi = NULL; ubh = NULL; flags = 0; UFSD("ENTER\n"); #ifndef CONFIG_UFS_FS_WRITE if (!sb_rdonly(sb)) { pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); return -EROFS; } #endif sbi = kzalloc(sizeof(struct ufs_sb_info), GFP_KERNEL); if (!sbi) goto failed_nomem; sb->s_fs_info = sbi; sbi->sb = sb; UFSD("flag %u\n", (int)(sb_rdonly(sb))); mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); /* * Set default mount options * Parse mount options */ sbi->s_mount_opt = 0; ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { pr_err("wrong mount options\n"); goto failed; } if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { if (!silent) pr_err("You didn't specify the type of your ufs filesystem\n\n" "mount -t ufs -o ufstype=" "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " "default is ufstype=old\n"); ufs_set_opt (sbi->s_mount_opt, UFSTYPE_OLD); } uspi = kzalloc(sizeof(struct ufs_sb_private_info), GFP_KERNEL); sbi->s_uspi = uspi; if (!uspi) goto failed; uspi->s_dirblksize = UFS_SECTOR_SIZE; super_block_offset=UFS_SBLOCK; sb->s_maxbytes = MAX_LFS_FILESIZE; switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); uspi->s_fsize = block_size = 512; uspi->s_fmask = ~(512 - 1); uspi->s_fshift = 9; uspi->s_sbsize = super_block_size = 1536; uspi->s_sbbase = 0; flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; break; case UFS_MOUNT_UFSTYPE_UFS2: UFSD("ufstype=ufs2\n"); super_block_offset=SBLOCK_UFS2; uspi->s_fsize = block_size = 512; uspi->s_fmask = ~(512 - 1); uspi->s_fshift = 9; uspi->s_sbsize = super_block_size = 1536; uspi->s_sbbase = 0; flags |= UFS_TYPE_UFS2 | UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; break; case UFS_MOUNT_UFSTYPE_SUN: UFSD("ufstype=sun\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_maxsymlinklen = 0; /* Not supported on disk */ flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUN | UFS_CG_SUN; break; case UFS_MOUNT_UFSTYPE_SUNOS: UFSD("ufstype=sunos\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = 2048; super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_maxsymlinklen = 0; /* Not supported on disk */ flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_SUNOS | UFS_CG_SUN; break; case UFS_MOUNT_UFSTYPE_SUNx86: UFSD("ufstype=sunx86\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_maxsymlinklen = 0; /* Not supported on disk */ flags |= UFS_DE_OLD | UFS_UID_EFT | UFS_ST_SUNx86 | UFS_CG_SUN; break; case UFS_MOUNT_UFSTYPE_OLD: UFSD("ufstype=old\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=old is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; case UFS_MOUNT_UFSTYPE_NEXTSTEP: UFSD("ufstype=nextstep\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_dirblksize = 1024; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; case UFS_MOUNT_UFSTYPE_NEXTSTEP_CD: UFSD("ufstype=nextstep-cd\n"); uspi->s_fsize = block_size = 2048; uspi->s_fmask = ~(2048 - 1); uspi->s_fshift = 11; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_dirblksize = 1024; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep-cd is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; case UFS_MOUNT_UFSTYPE_OPENSTEP: UFSD("ufstype=openstep\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; uspi->s_dirblksize = 1024; flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=openstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; case UFS_MOUNT_UFSTYPE_HP: UFSD("ufstype=hp\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; uspi->s_sbsize = super_block_size = 2048; uspi->s_sbbase = 0; flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=hp is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; default: if (!silent) pr_err("unknown ufstype\n"); goto failed; } again: if (!sb_set_blocksize(sb, block_size)) { pr_err("failed to set blocksize\n"); goto failed; } /* * read ufs super block from device */ ubh = ubh_bread_uspi(uspi, sb, uspi->s_sbbase + super_block_offset/block_size, super_block_size); if (!ubh) goto failed; usb1 = ubh_get_usb_first(uspi); usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); /* Sort out mod used on SunOS 4.1.3 for fs_state */ uspi->s_postblformat = fs32_to_cpu(sb, usb3->fs_postblformat); if (((flags & UFS_ST_MASK) == UFS_ST_SUNOS) && (uspi->s_postblformat != UFS_42POSTBLFMT)) { flags &= ~UFS_ST_MASK; flags |= UFS_ST_SUN; } if ((flags & UFS_ST_MASK) == UFS_ST_44BSD && uspi->s_postblformat == UFS_42POSTBLFMT) { if (!silent) pr_err("this is not a 44bsd filesystem"); goto failed; } /* * Check ufs magic number */ sbi->s_bytesex = BYTESEX_LE; switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { case UFS_MAGIC: case UFS_MAGIC_BW: case UFS2_MAGIC: case UFS_MAGIC_LFN: case UFS_MAGIC_FEA: case UFS_MAGIC_4GB: goto magic_found; } sbi->s_bytesex = BYTESEX_BE; switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { case UFS_MAGIC: case UFS_MAGIC_BW: case UFS2_MAGIC: case UFS_MAGIC_LFN: case UFS_MAGIC_FEA: case UFS_MAGIC_4GB: goto magic_found; } if ((((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP) || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD) || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_OPENSTEP)) && uspi->s_sbbase < 256) { ubh_brelse_uspi(uspi); ubh = NULL; uspi->s_sbbase += 8; goto again; } if (!silent) pr_err("%s(): bad magic number\n", __func__); goto failed; magic_found: /* * Check block and fragment sizes */ uspi->s_bsize = fs32_to_cpu(sb, usb1->fs_bsize); uspi->s_fsize = fs32_to_cpu(sb, usb1->fs_fsize); uspi->s_sbsize = fs32_to_cpu(sb, usb1->fs_sbsize); uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); if (!is_power_of_2(uspi->s_fsize)) { pr_err("%s(): fragment size %u is not a power of 2\n", __func__, uspi->s_fsize); goto failed; } if (uspi->s_fsize < 512) { pr_err("%s(): fragment size %u is too small\n", __func__, uspi->s_fsize); goto failed; } if (uspi->s_fsize > 4096) { pr_err("%s(): fragment size %u is too large\n", __func__, uspi->s_fsize); goto failed; } if (!is_power_of_2(uspi->s_bsize)) { pr_err("%s(): block size %u is not a power of 2\n", __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize < 4096) { pr_err("%s(): block size %u is too small\n", __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize / uspi->s_fsize > 8) { pr_err("%s(): too many fragments per block (%u)\n", __func__, uspi->s_bsize / uspi->s_fsize); goto failed; } if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) { ubh_brelse_uspi(uspi); ubh = NULL; block_size = uspi->s_fsize; super_block_size = uspi->s_sbsize; UFSD("another value of block_size or super_block_size %u, %u\n", block_size, super_block_size); goto again; } sbi->s_flags = flags;/*after that line some functions use s_flags*/ ufs_print_super_stuff(sb, usb1, usb2, usb3); /* * Check, if file system was correctly unmounted. * If not, make it read only. */ if (((flags & UFS_ST_MASK) == UFS_ST_44BSD) || ((flags & UFS_ST_MASK) == UFS_ST_OLD) || (((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) && (ufs_get_fs_state(sb, usb1, usb3) == (UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time))))) { switch(usb1->fs_clean) { case UFS_FSCLEAN: UFSD("fs is clean\n"); break; case UFS_FSSTABLE: UFSD("fs is stable\n"); break; case UFS_FSLOG: UFSD("fs is logging fs\n"); break; case UFS_FSOSF1: UFSD("fs is DEC OSF/1\n"); break; case UFS_FSACTIVE: pr_err("%s(): fs is active\n", __func__); sb->s_flags |= MS_RDONLY; break; case UFS_FSBAD: pr_err("%s(): fs is bad\n", __func__); sb->s_flags |= MS_RDONLY; break; default: pr_err("%s(): can't grok fs_clean 0x%x\n", __func__, usb1->fs_clean); sb->s_flags |= MS_RDONLY; break; } } else { pr_err("%s(): fs needs fsck\n", __func__); sb->s_flags |= MS_RDONLY; } /* * Read ufs_super_block into internal data structures */ sb->s_op = &ufs_super_ops; sb->s_export_op = &ufs_export_ops; sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); uspi->s_cblkno = fs32_to_cpu(sb, usb1->fs_cblkno); uspi->s_iblkno = fs32_to_cpu(sb, usb1->fs_iblkno); uspi->s_dblkno = fs32_to_cpu(sb, usb1->fs_dblkno); uspi->s_cgoffset = fs32_to_cpu(sb, usb1->fs_cgoffset); uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { uspi->s_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); uspi->s_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); } else { uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); } uspi->s_ncg = fs32_to_cpu(sb, usb1->fs_ncg); /* s_bsize already set */ /* s_fsize already set */ uspi->s_fpb = fs32_to_cpu(sb, usb1->fs_frag); uspi->s_minfree = fs32_to_cpu(sb, usb1->fs_minfree); uspi->s_bmask = fs32_to_cpu(sb, usb1->fs_bmask); uspi->s_fmask = fs32_to_cpu(sb, usb1->fs_fmask); uspi->s_bshift = fs32_to_cpu(sb, usb1->fs_bshift); uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); UFSD("uspi->s_bshift = %d,uspi->s_fshift = %d", uspi->s_bshift, uspi->s_fshift); uspi->s_fpbshift = fs32_to_cpu(sb, usb1->fs_fragshift); uspi->s_fsbtodb = fs32_to_cpu(sb, usb1->fs_fsbtodb); /* s_sbsize already set */ uspi->s_csmask = fs32_to_cpu(sb, usb1->fs_csmask); uspi->s_csshift = fs32_to_cpu(sb, usb1->fs_csshift); uspi->s_nindir = fs32_to_cpu(sb, usb1->fs_nindir); uspi->s_inopb = fs32_to_cpu(sb, usb1->fs_inopb); uspi->s_nspf = fs32_to_cpu(sb, usb1->fs_nspf); uspi->s_npsect = ufs_get_fs_npsect(sb, usb1, usb3); uspi->s_interleave = fs32_to_cpu(sb, usb1->fs_interleave); uspi->s_trackskew = fs32_to_cpu(sb, usb1->fs_trackskew); if (uspi->fs_magic == UFS2_MAGIC) uspi->s_csaddr = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_csaddr); else uspi->s_csaddr = fs32_to_cpu(sb, usb1->fs_csaddr); uspi->s_cssize = fs32_to_cpu(sb, usb1->fs_cssize); uspi->s_cgsize = fs32_to_cpu(sb, usb1->fs_cgsize); uspi->s_ntrak = fs32_to_cpu(sb, usb1->fs_ntrak); uspi->s_nsect = fs32_to_cpu(sb, usb1->fs_nsect); uspi->s_spc = fs32_to_cpu(sb, usb1->fs_spc); uspi->s_ipg = fs32_to_cpu(sb, usb1->fs_ipg); uspi->s_fpg = fs32_to_cpu(sb, usb1->fs_fpg); uspi->s_cpc = fs32_to_cpu(sb, usb2->fs_un.fs_u1.fs_cpc); uspi->s_contigsumsize = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_contigsumsize); uspi->s_qbmask = ufs_get_fs_qbmask(sb, usb3); uspi->s_qfmask = ufs_get_fs_qfmask(sb, usb3); uspi->s_nrpos = fs32_to_cpu(sb, usb3->fs_nrpos); uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff); uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff); uspi->s_root_blocks = mul_u64_u32_div(uspi->s_dsize, uspi->s_minfree, 100); if (uspi->s_minfree <= 5) { uspi->s_time_to_space = ~0ULL; uspi->s_space_to_time = 0; usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE); } else { uspi->s_time_to_space = (uspi->s_root_blocks / 2) + 1; uspi->s_space_to_time = mul_u64_u32_div(uspi->s_dsize, uspi->s_minfree - 2, 100) - 1; } /* * Compute another frequently used values */ uspi->s_fpbmask = uspi->s_fpb - 1; if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) uspi->s_apbshift = uspi->s_bshift - 3; else uspi->s_apbshift = uspi->s_bshift - 2; uspi->s_2apbshift = uspi->s_apbshift * 2; uspi->s_3apbshift = uspi->s_apbshift * 3; uspi->s_apb = 1 << uspi->s_apbshift; uspi->s_2apb = 1 << uspi->s_2apbshift; uspi->s_3apb = 1 << uspi->s_3apbshift; uspi->s_apbmask = uspi->s_apb - 1; uspi->s_nspfshift = uspi->s_fshift - UFS_SECTOR_BITS; uspi->s_nspb = uspi->s_nspf << uspi->s_fpbshift; uspi->s_inopf = uspi->s_inopb >> uspi->s_fpbshift; uspi->s_bpf = uspi->s_fsize << 3; uspi->s_bpfshift = uspi->s_fshift + 3; uspi->s_bpfmask = uspi->s_bpf - 1; if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_44BSD || (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_UFS2) uspi->s_maxsymlinklen = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen); if (uspi->fs_magic == UFS2_MAGIC) maxsymlen = 2 * 4 * (UFS_NDADDR + UFS_NINDIR); else maxsymlen = 4 * (UFS_NDADDR + UFS_NINDIR); if (uspi->s_maxsymlinklen > maxsymlen) { ufs_warning(sb, __func__, "ufs_read_super: excessive maximum " "fast symlink size (%u)\n", uspi->s_maxsymlinklen); uspi->s_maxsymlinklen = maxsymlen; } sb->s_maxbytes = ufs_max_bytes(sb); sb->s_max_links = UFS_LINK_MAX; inode = ufs_iget(sb, UFS_ROOTINO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto failed; } sb->s_root = d_make_root(inode); if (!sb->s_root) { ret = -ENOMEM; goto failed; } ufs_setup_cstotal(sb); /* * Read cylinder group structures */ if (!sb_rdonly(sb)) if (!ufs_read_cylinder_structures(sb)) goto failed; UFSD("EXIT\n"); return 0; failed: if (ubh) ubh_brelse_uspi (uspi); kfree (uspi); kfree(sbi); sb->s_fs_info = NULL; UFSD("EXIT (FAILED)\n"); return ret; failed_nomem: UFSD("EXIT (NOMEM)\n"); return -ENOMEM; } static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_third * usb3; unsigned new_mount_opt, ufstype; unsigned flags; sync_filesystem(sb); mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); /* * Allow the "check" option to be passed as a remount option. * It is not possible to change ufstype option during remount */ ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { pr_err("ufstype can't be changed during remount\n"); mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; } if ((bool)(*mount_flags & MS_RDONLY) == sb_rdonly(sb)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } /* * fs was mouted as rw, remounting ro */ if (*mount_flags & MS_RDONLY) { ufs_put_super_internal(sb); usb1->fs_time = cpu_to_fs32(sb, get_seconds()); if ((flags & UFS_ST_MASK) == UFS_ST_SUN || (flags & UFS_ST_MASK) == UFS_ST_SUNOS || (flags & UFS_ST_MASK) == UFS_ST_SUNx86) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); ubh_mark_buffer_dirty (USPI_UBH(uspi)); sb->s_flags |= MS_RDONLY; } else { /* * fs was mounted as ro, remounting rw */ #ifndef CONFIG_UFS_FS_WRITE pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && ufstype != UFS_MOUNT_UFSTYPE_SUNOS && ufstype != UFS_MOUNT_UFSTYPE_44BSD && ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { pr_err("this ufstype is read-only supported\n"); mutex_unlock(&UFS_SB(sb)->s_lock); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { pr_err("failed during remounting\n"); mutex_unlock(&UFS_SB(sb)->s_lock); return -EPERM; } sb->s_flags &= ~MS_RDONLY; #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } static int ufs_show_options(struct seq_file *seq, struct dentry *root) { struct ufs_sb_info *sbi = UFS_SB(root->d_sb); unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; const struct match_token *tp = tokens; while (tp->token != Opt_onerror_panic && tp->token != mval) ++tp; BUG_ON(tp->token == Opt_onerror_panic); seq_printf(seq, ",%s", tp->pattern); mval = sbi->s_mount_opt & UFS_MOUNT_ONERROR; while (tp->token != Opt_err && tp->token != mval) ++tp; BUG_ON(tp->token == Opt_err); seq_printf(seq, ",%s", tp->pattern); return 0; } static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ufs_sb_private_info *uspi= UFS_SB(sb)->s_uspi; unsigned flags = UFS_SB(sb)->s_flags; struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); mutex_lock(&UFS_SB(sb)->s_lock); usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) buf->f_type = UFS2_MAGIC; else buf->f_type = UFS_MAGIC; buf->f_blocks = uspi->s_dsize; buf->f_bfree = ufs_freefrags(uspi); buf->f_ffree = uspi->cs_total.cs_nifree; buf->f_bsize = sb->s_blocksize; buf->f_bavail = (buf->f_bfree > uspi->s_root_blocks) ? (buf->f_bfree - uspi->s_root_blocks) : 0; buf->f_files = uspi->s_ncg * uspi->s_ipg; buf->f_namelen = UFS_MAXNAMLEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } static struct kmem_cache * ufs_inode_cachep; static struct inode *ufs_alloc_inode(struct super_block *sb) { struct ufs_inode_info *ei; ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS); if (!ei) return NULL; ei->vfs_inode.i_version = 1; seqlock_init(&ei->meta_lock); mutex_init(&ei->truncate_mutex); return &ei->vfs_inode; } static void ufs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(ufs_inode_cachep, UFS_I(inode)); } static void ufs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, ufs_i_callback); } static void init_once(void *foo) { struct ufs_inode_info *ei = (struct ufs_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(ufs_inode_cachep); } static const struct super_operations ufs_super_ops = { .alloc_inode = ufs_alloc_inode, .destroy_inode = ufs_destroy_inode, .write_inode = ufs_write_inode, .evict_inode = ufs_evict_inode, .put_super = ufs_put_super, .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, .remount_fs = ufs_remount, .show_options = ufs_show_options, }; static struct dentry *ufs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super); } static struct file_system_type ufs_fs_type = { .owner = THIS_MODULE, .name = "ufs", .mount = ufs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ufs"); static int __init init_ufs_fs(void) { int err = init_inodecache(); if (err) goto out1; err = register_filesystem(&ufs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_ufs_fs(void) { unregister_filesystem(&ufs_fs_type); destroy_inodecache(); } module_init(init_ufs_fs) module_exit(exit_ufs_fs) MODULE_LICENSE("GPL");
251 10 85 85 258 258 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/highmem.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include "kmem.h" #include "xfs_message.h" void * kmem_alloc(size_t size, xfs_km_flags_t flags) { int retries = 0; gfp_t lflags = kmem_flags_convert(flags); void *ptr; do { ptr = kmalloc(size, lflags); if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) return ptr; if (!(++retries % 100)) xfs_err(NULL, "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", current->comm, current->pid, (unsigned int)size, __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } void * kmem_zalloc_large(size_t size, xfs_km_flags_t flags) { unsigned nofs_flag = 0; void *ptr; gfp_t lflags; ptr = kmem_zalloc(size, flags | KM_MAYFAIL); if (ptr) return ptr; /* * __vmalloc() will allocate data pages and auxillary structures (e.g. * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context * here. Hence we need to tell memory reclaim that we are in such a * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering * the filesystem here and potentially deadlocking. */ if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); lflags = kmem_flags_convert(flags); ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); return ptr; } void * kmem_realloc(const void *old, size_t newsize, xfs_km_flags_t flags) { int retries = 0; gfp_t lflags = kmem_flags_convert(flags); void *ptr; do { ptr = krealloc(old, newsize, lflags); if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) return ptr; if (!(++retries % 100)) xfs_err(NULL, "%s(%u) possible memory allocation deadlock size %zu in %s (mode:0x%x)", current->comm, current->pid, newsize, __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } void * kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags) { int retries = 0; gfp_t lflags = kmem_flags_convert(flags); void *ptr; do { ptr = kmem_cache_alloc(zone, lflags); if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) return ptr; if (!(++retries % 100)) xfs_err(NULL, "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)", current->comm, current->pid, __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); }
193 192 193 166 166 166 187 192 193 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 /* * linux/drivers/video/console/softcursor.c * * Generic software cursor for frame buffer devices * * Created 14 Nov 2002 by James Simmons * * This file is subject to the terms and conditions of the GNU General * Public License. See the file COPYING in the main directory of this * archive for more details. */ #include <linux/module.h> #include <linux/string.h> #include <linux/fb.h> #include <linux/slab.h> #include <asm/io.h> #include "fbcon.h" int soft_cursor(struct fb_info *info, struct fb_cursor *cursor) { struct fbcon_ops *ops = info->fbcon_par; unsigned int scan_align = info->pixmap.scan_align - 1; unsigned int buf_align = info->pixmap.buf_align - 1; unsigned int i, size, dsize, s_pitch, d_pitch; struct fb_image *image; u8 *src, *dst; if (info->state != FBINFO_STATE_RUNNING) return 0; s_pitch = (cursor->image.width + 7) >> 3; dsize = s_pitch * cursor->image.height; if (dsize + sizeof(struct fb_image) != ops->cursor_size) { kfree(ops->cursor_src); ops->cursor_size = dsize + sizeof(struct fb_image); ops->cursor_src = kmalloc(ops->cursor_size, GFP_ATOMIC); if (!ops->cursor_src) { ops->cursor_size = 0; return -ENOMEM; } } src = ops->cursor_src + sizeof(struct fb_image); image = (struct fb_image *)ops->cursor_src; *image = cursor->image; d_pitch = (s_pitch + scan_align) & ~scan_align; size = d_pitch * image->height + buf_align; size &= ~buf_align; dst = fb_get_buffer_offset(info, &info->pixmap, size); if (cursor->enable) { switch (cursor->rop) { case ROP_XOR: for (i = 0; i < dsize; i++) src[i] = image->data[i] ^ cursor->mask[i]; break; case ROP_COPY: default: for (i = 0; i < dsize; i++) src[i] = image->data[i] & cursor->mask[i]; break; } } else memcpy(src, image->data, dsize); fb_pad_aligned_buffer(dst, d_pitch, src, s_pitch, image->height); image->data = dst; info->fbops->fb_imageblit(info, image); return 0; } EXPORT_SYMBOL(soft_cursor);
24 1 8 48 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 /* Copyright (C) 2014-2017 B.A.T.M.A.N. contributors: * * Linus Lüssing * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #include "multicast.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/fs.h> #include <linux/icmpv6.h> #include <linux/if_bridge.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/addrconf.h> #include <net/if_inet6.h> #include <net/ip.h> #include <net/ipv6.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "packet.h" #include "send.h" #include "translation-table.h" #include "tvlv.h" static void batadv_mcast_mla_update(struct work_struct *work); /** * batadv_mcast_start_timer - schedule the multicast periodic worker * @bat_priv: the bat priv with all the soft interface information */ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) { queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work, msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD)); } /** * batadv_mcast_get_bridge - get the bridge on top of the softif if it exists * @soft_iface: netdev struct of the mesh interface * * If the given soft interface has a bridge on top then the refcount * of the according net device is increased. * * Return: NULL if no such bridge exists. Otherwise the net device of the * bridge. */ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) { struct net_device *upper = soft_iface; rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); if (upper) dev_hold(upper); rcu_read_unlock(); return upper; } /** * batadv_mcast_mla_softif_get - get softif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * * Collects multicast addresses of multicast listeners residing * on this kernel on the given soft interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * If there is a bridge interface on top of dev, collects from that one * instead. Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_softif_get(struct net_device *dev, struct hlist_head *mcast_list) { struct net_device *bridge = batadv_mcast_get_bridge(dev); struct netdev_hw_addr *mc_list_entry; struct batadv_hw_addr *new; int ret = 0; netif_addr_lock_bh(bridge ? bridge : dev); netdev_for_each_mc_addr(mc_list_entry, bridge ? bridge : dev) { new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mc_list_entry->addr); hlist_add_head(&new->list, mcast_list); ret++; } netif_addr_unlock_bh(bridge ? bridge : dev); if (bridge) dev_put(bridge); return ret; } /** * batadv_mcast_mla_is_duplicate - check whether an address is in a list * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * * Return: true if the given address is already in the given list. * Otherwise returns false. */ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; hlist_for_each_entry(mcast_entry, mcast_list, list) if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) return true; return false; } /** * batadv_mcast_mla_br_addr_cpy - copy a bridge multicast address * @dst: destination to write to - a multicast MAC address * @src: source to read from - a multicast IP address * * Converts a given multicast IPv4/IPv6 address from a bridge * to its matching multicast MAC address and copies it into the given * destination buffer. * * Caller needs to make sure the destination buffer can hold * at least ETH_ALEN bytes. */ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) { if (src->proto == htons(ETH_P_IP)) ip_eth_mc_map(src->u.ip4, dst); #if IS_ENABLED(CONFIG_IPV6) else if (src->proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&src->u.ip6, dst); #endif else eth_zero_addr(dst); } /** * batadv_mcast_mla_bridge_get - get bridged-in multicast listeners * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into * * Collects multicast addresses of multicast listeners residing * on foreign, non-mesh devices which we gave access to our mesh via * a bridge on top of the given soft interface, dev, in the given * mcast_list. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_bridge_get(struct net_device *dev, struct hlist_head *mcast_list) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); struct br_ip_list *br_ip_entry, *tmp; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; int ret; /* we don't need to detect these devices/listeners, the IGMP/MLD * snooping code of the Linux bridge already does that for us */ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); if (ret < 0) goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); } out: list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { list_del(&br_ip_entry->list); kfree(br_ip_entry); } return ret; } /** * batadv_mcast_mla_list_free - free a list of multicast addresses * @mcast_list: the list to free * * Removes and frees all items in the given mcast_list. */ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_retract - clean up multicast listener announcements * @bat_priv: the bat priv with all the soft interface information * @mcast_list: a list of addresses which should _not_ be removed * * Retracts the announcement of any multicast listener from the * translation table except the ones listed in the given mcast_list. * * If mcast_list is NULL then all are retracted. */ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, list) { if (mcast_list && batadv_mcast_mla_is_duplicate(mcast_entry->addr, mcast_list)) continue; batadv_tt_local_remove(bat_priv, mcast_entry->addr, BATADV_NO_FLAGS, "mcast TT outdated", false); hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_add - add multicast listener announcements * @bat_priv: the bat priv with all the soft interface information * @mcast_list: a list of addresses which are going to get added * * Adds multicast listener announcements from the given mcast_list to the * translation table if they have not been added yet. */ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; if (!mcast_list) return; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { if (batadv_mcast_mla_is_duplicate(mcast_entry->addr, &bat_priv->mcast.mla_list)) continue; if (!batadv_tt_local_add(bat_priv->soft_iface, mcast_entry->addr, BATADV_NO_FLAGS, BATADV_NULL_IFINDEX, BATADV_NO_MARK)) continue; hlist_del(&mcast_entry->list); hlist_add_head(&mcast_entry->list, &bat_priv->mcast.mla_list); } } /** * batadv_mcast_has_bridge - check whether the soft-iface is bridged * @bat_priv: the bat priv with all the soft interface information * * Checks whether there is a bridge on top of our soft interface. * * Return: true if there is a bridge, false otherwise. */ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv) { struct net_device *upper = bat_priv->soft_iface; rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); rcu_read_unlock(); return upper; } /** * batadv_mcast_querier_log - debug output regarding the querier status on link * @bat_priv: the bat priv with all the soft interface information * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") * @old_state: the previous querier state on our link * @new_state: the new querier state on our link * * Outputs debug messages to the logging facility with log level 'mcast' * regarding changes to the querier status on the link which are relevant * to our multicast optimizations. * * Usually this is about whether a querier appeared or vanished in * our mesh or whether the querier is in the suboptimal position of being * behind our local bridge segment: Snooping switches will directly * forward listener reports to the querier, therefore batman-adv and * the bridge will potentially not see these listeners - the querier is * potentially shadowing listeners from us then. * * This is only interesting for nodes with a bridge on top of their * soft interface. */ static void batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, struct batadv_mcast_querier_state *old_state, struct batadv_mcast_querier_state *new_state) { if (!old_state->exists && new_state->exists) batadv_info(bat_priv->soft_iface, "%s Querier appeared\n", str_proto); else if (old_state->exists && !new_state->exists) batadv_info(bat_priv->soft_iface, "%s Querier disappeared - multicast optimizations disabled\n", str_proto); else if (!bat_priv->mcast.bridged && !new_state->exists) batadv_info(bat_priv->soft_iface, "No %s Querier present - multicast optimizations disabled\n", str_proto); if (new_state->exists) { if ((!old_state->shadowing && new_state->shadowing) || (!old_state->exists && new_state->shadowing)) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is behind our bridged segment: Might shadow listeners\n", str_proto); else if (old_state->shadowing && !new_state->shadowing) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is not behind our bridged segment\n", str_proto); } } /** * batadv_mcast_bridge_log - debug output for topology changes in bridged setups * @bat_priv: the bat priv with all the soft interface information * @bridged: a flag about whether the soft interface is currently bridged or not * @querier_ipv4: (maybe) new status of a potential, selected IGMP querier * @querier_ipv6: (maybe) new status of a potential, selected MLD querier * * If no bridges are ever used on this node, then this function does nothing. * * Otherwise this function outputs debug information to the 'mcast' log level * which might be relevant to our multicast optimizations. * * More precisely, it outputs information when a bridge interface is added or * removed from a soft interface. And when a bridge is present, it further * outputs information about the querier state which is relevant for the * multicast flags this node is going to set. */ static void batadv_mcast_bridge_log(struct batadv_priv *bat_priv, bool bridged, struct batadv_mcast_querier_state *querier_ipv4, struct batadv_mcast_querier_state *querier_ipv6) { if (!bat_priv->mcast.bridged && bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge added: Setting Unsnoopables(U)-flag\n"); else if (bat_priv->mcast.bridged && !bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); if (bridged) { batadv_mcast_querier_log(bat_priv, "IGMP", &bat_priv->mcast.querier_ipv4, querier_ipv4); batadv_mcast_querier_log(bat_priv, "MLD", &bat_priv->mcast.querier_ipv6, querier_ipv6); } } /** * batadv_mcast_flags_logs - output debug information about mcast flag changes * @bat_priv: the bat priv with all the soft interface information * @flags: flags indicating the new multicast state * * Whenever the multicast flags this nodes announces changes (@mcast_flags vs. * bat_priv->mcast.flags), this notifies userspace via the 'mcast' log level. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { u8 old_flags = bat_priv->mcast.flags; char str_old_flags[] = "[...]"; sprintf(str_old_flags, "[%c%c%c]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Changing multicast flags from '%s' to '[%c%c%c]'\n", bat_priv->mcast.enabled ? str_old_flags : "<undefined>", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); } /** * batadv_mcast_mla_tvlv_update - update multicast tvlv * @bat_priv: the bat priv with all the soft interface information * * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. * * Return: false if we want all IPv4 && IPv6 multicast traffic and true * otherwise. */ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv) { struct batadv_tvlv_mcast_data mcast_data; struct batadv_mcast_querier_state querier4 = {false, false}; struct batadv_mcast_querier_state querier6 = {false, false}; struct net_device *dev = bat_priv->soft_iface; bool bridged; mcast_data.flags = BATADV_NO_FLAGS; memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); bridged = batadv_mcast_has_bridge(bat_priv); if (!bridged) goto update; if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); querier4.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); querier4.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); querier6.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); querier6.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); mcast_data.flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; /* 1) If no querier exists at all, then multicast listeners on * our local TT clients behind the bridge will keep silent. * 2) If the selected querier is on one of our local TT clients, * behind the bridge, then this querier might shadow multicast * listeners on our local TT clients, behind this bridge. * * In both cases, we will signalize other batman nodes that * we need all multicast traffic of the according protocol. */ if (!querier4.exists || querier4.shadowing) mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV4; if (!querier6.exists || querier6.shadowing) mcast_data.flags |= BATADV_MCAST_WANT_ALL_IPV6; update: batadv_mcast_bridge_log(bat_priv, bridged, &querier4, &querier6); bat_priv->mcast.querier_ipv4.exists = querier4.exists; bat_priv->mcast.querier_ipv4.shadowing = querier4.shadowing; bat_priv->mcast.querier_ipv6.exists = querier6.exists; bat_priv->mcast.querier_ipv6.shadowing = querier6.shadowing; bat_priv->mcast.bridged = bridged; if (!bat_priv->mcast.enabled || mcast_data.flags != bat_priv->mcast.flags) { batadv_mcast_flags_log(bat_priv, mcast_data.flags); batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.flags = mcast_data.flags; bat_priv->mcast.enabled = true; } return !(mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV4 && mcast_data.flags & BATADV_MCAST_WANT_ALL_IPV6); } /** * __batadv_mcast_mla_update - update the own MLAs * @bat_priv: the bat priv with all the soft interface information * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are * ensured by the non-parallel execution of the worker this function * belongs to. */ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) { struct net_device *soft_iface = bat_priv->soft_iface; struct hlist_head mcast_list = HLIST_HEAD_INIT; int ret; if (!batadv_mcast_mla_tvlv_update(bat_priv)) goto update; ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list); if (ret < 0) goto out; ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list); if (ret < 0) goto out; update: batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); out: batadv_mcast_mla_list_free(&mcast_list); } /** * batadv_mcast_mla_update - update the own MLAs * @work: kernel work struct * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * In the end, reschedules the work timer. */ static void batadv_mcast_mla_update(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_mcast *priv_mcast; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work); bat_priv = container_of(priv_mcast, struct batadv_priv, mcast); spin_lock(&bat_priv->mcast.mla_lock); __batadv_mcast_mla_update(bat_priv); spin_unlock(&bat_priv->mcast.mla_lock); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_is_report_ipv4 - check for IGMP reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid IGMP report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) { if (ip_mc_check_igmp(skb, NULL) < 0) return false; switch (igmp_hdr(skb)->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv4 - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable * * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory * allocation failure. */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable) { struct iphdr *iphdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv4(skb)) return -EINVAL; iphdr = ip_hdr(skb); /* TODO: Implement Multicast Router Discovery (RFC4286), * then allow scope > link local, too */ if (!ipv4_is_local_multicast(iphdr->daddr)) return -EINVAL; /* link-local multicast listeners behind a bridge are * not snoopable (see RFC4541, section 2.1.2.2) */ *is_unsnoopable = true; return 0; } /** * batadv_mcast_is_report_ipv6 - check for MLD reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid MLD report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) { if (ipv6_mc_check_mld(skb, NULL) < 0) return false; switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_MGM_REPORT: case ICMPV6_MLD2_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv6 - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable * * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable) { struct ipv6hdr *ip6hdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv6(skb)) return -EINVAL; ip6hdr = ipv6_hdr(skb); /* TODO: Implement Multicast Router Discovery (RFC4286), * then allow scope > link local, too */ if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) != IPV6_ADDR_SCOPE_LINKLOCAL) return -EINVAL; /* link-local-all-nodes multicast listeners behind a bridge are * not snoopable (see RFC4541, section 3, paragraph 3) */ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr)) *is_unsnoopable = true; return 0; } /** * batadv_mcast_forw_mode_check - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is snoopable * * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable) { struct ethhdr *ethhdr = eth_hdr(skb); if (!atomic_read(&bat_priv->multicast_mode)) return -EINVAL; if (atomic_read(&bat_priv->mcast.num_disabled)) return -EINVAL; switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, is_unsnoopable); case ETH_P_IPV6: if (!IS_ENABLED(CONFIG_IPV6)) return -EINVAL; return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, is_unsnoopable); default: return -EINVAL; } } /** * batadv_mcast_forw_want_all_ip_count - count nodes with unspecific mcast * interest * @bat_priv: the bat priv with all the soft interface information * @ethhdr: ethernet header of a packet * * Return: the number of nodes which want all IPv4 multicast traffic if the * given ethhdr is from an IPv4 packet or the number of nodes which want all * IPv6 traffic if it matches an IPv6 packet. */ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_ipv4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_ipv6); default: /* we shouldn't be here... */ return 0; } } /** * batadv_mcast_forw_tt_node_get - get a multicast tt node * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the ether header containing the multicast destination * * Return: an orig_node matching the multicast address provided by ethhdr * via a translation table lookup. This increases the returned nodes refcount. */ static struct batadv_orig_node * batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { return batadv_transtable_search(bat_priv, NULL, ethhdr->h_dest, BATADV_NO_FLAGS); } /** * batadv_mcast_forw_ipv4_node_get - get a node with an ipv4 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and * increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) { struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; break; } rcu_read_unlock(); return orig_node; } /** * batadv_mcast_forw_ipv6_node_get - get a node with an ipv6 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set * and increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) { struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; break; } rcu_read_unlock(); return orig_node; } /** * batadv_mcast_forw_ip_node_get - get a node with an ipv4/ipv6 flag * @bat_priv: the bat priv with all the soft interface information * @ethhdr: an ethernet header to determine the protocol family from * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and * increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_ipv4_node_get(bat_priv); case ETH_P_IPV6: return batadv_mcast_forw_ipv6_node_get(bat_priv); default: /* we shouldn't be here... */ return NULL; } } /** * batadv_mcast_forw_unsnoop_node_get - get a node with an unsnoopable flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag * set and increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) { struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_unsnoopables_list, mcast_want_all_unsnoopables_node) { if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; break; } rcu_read_unlock(); return orig_node; } /** * batadv_mcast_forw_mode - check on how to forward a multicast packet * @bat_priv: the bat priv with all the soft interface information * @skb: The multicast packet to check * @orig: an originator to be set to forward the skb to * * Return: the forwarding mode as enum batadv_forw_mode and in case of * BATADV_FORW_SINGLE set the orig to the single originator the skb * should be forwarded to. */ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node **orig) { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; struct ethhdr *ethhdr; ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable); if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) return BATADV_FORW_ALL; ethhdr = eth_hdr(skb); tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest, BATADV_NO_FLAGS); ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr); unsnoop_count = !is_unsnoopable ? 0 : atomic_read(&bat_priv->mcast.num_want_all_unsnoopables); total_count = tt_count + ip_count + unsnoop_count; switch (total_count) { case 1: if (tt_count) *orig = batadv_mcast_forw_tt_node_get(bat_priv, ethhdr); else if (ip_count) *orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr); else if (unsnoop_count) *orig = batadv_mcast_forw_unsnoop_node_get(bat_priv); if (*orig) return BATADV_FORW_SINGLE; /* fall through */ case 0: return BATADV_FORW_NONE; default: return BATADV_FORW_ALL; } } /** * batadv_mcast_want_unsnoop_update - update unsnoop counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, * orig, has toggled then this method updates counter and list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) { atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv4_update - update want-all-ipv4 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has * toggled then this method updates counter and list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv4_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) { atomic_dec(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv6_update - update want-all-ipv6 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has * toggled then this method updates counter and list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv6_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) { atomic_dec(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_tvlv_ogm_handler - process incoming multicast tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the multicast data * @tvlv_value_len: tvlv buffer length */ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); u8 mcast_flags = BATADV_NO_FLAGS; bool orig_initialized; if (orig_mcast_enabled && tvlv_value && (tvlv_value_len >= sizeof(mcast_flags))) mcast_flags = *(u8 *)tvlv_value; spin_lock_bh(&orig->mcast_handler_lock); orig_initialized = test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); /* If mcast support is turned on decrease the disabled mcast node * counter only if we had increased it for this node before. If this * is a completely new orig_node no need to decrease the counter. */ if (orig_mcast_enabled && !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { if (orig_initialized) atomic_dec(&bat_priv->mcast.num_disabled); set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); /* If mcast support is being switched off or if this is an initial * OGM without mcast support then increase the disabled mcast * node counter. */ } else if (!orig_mcast_enabled && (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) || !orig_initialized)) { atomic_inc(&bat_priv->mcast.num_disabled); clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); } /** * batadv_mcast_init - initialize the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ void batadv_mcast_init(struct batadv_priv *bat_priv) { batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update); batadv_mcast_start_timer(bat_priv); } #ifdef CONFIG_BATMAN_ADV_DEBUGFS /** * batadv_mcast_flags_print_header - print own mcast flags to debugfs table * @bat_priv: the bat priv with all the soft interface information * @seq: debugfs table seq_file struct * * Prints our own multicast flags including a more specific reason why * they are set, that is prints the bridge and querier state too, to * the debugfs table specified via @seq. */ static void batadv_mcast_flags_print_header(struct batadv_priv *bat_priv, struct seq_file *seq) { u8 flags = bat_priv->mcast.flags; char querier4, querier6, shadowing4, shadowing6; bool bridged = bat_priv->mcast.bridged; if (bridged) { querier4 = bat_priv->mcast.querier_ipv4.exists ? '.' : '4'; querier6 = bat_priv->mcast.querier_ipv6.exists ? '.' : '6'; shadowing4 = bat_priv->mcast.querier_ipv4.shadowing ? '4' : '.'; shadowing6 = bat_priv->mcast.querier_ipv6.shadowing ? '6' : '.'; } else { querier4 = '?'; querier6 = '?'; shadowing4 = '?'; shadowing6 = '?'; } seq_printf(seq, "Multicast flags (own flags: [%c%c%c])\n", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); seq_printf(seq, "* Bridged [U]\t\t\t\t%c\n", bridged ? 'U' : '.'); seq_printf(seq, "* No IGMP/MLD Querier [4/6]:\t\t%c/%c\n", querier4, querier6); seq_printf(seq, "* Shadowing IGMP/MLD Querier [4/6]:\t%c/%c\n", shadowing4, shadowing6); seq_puts(seq, "-------------------------------------------\n"); seq_printf(seq, " %-10s %s\n", "Originator", "Flags"); } /** * batadv_mcast_flags_seq_print_text - print the mcast flags of other nodes * @seq: seq file to print on * @offset: not used * * This prints a table of (primary) originators and their according * multicast flags, including (in the header) our own. * * Return: always 0 */ int batadv_mcast_flags_seq_print_text(struct seq_file *seq, void *offset) { struct net_device *net_dev = (struct net_device *)seq->private; struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hard_iface *primary_if; struct batadv_hashtable *hash = bat_priv->orig_hash; struct batadv_orig_node *orig_node; struct hlist_head *head; u8 flags; u32 i; primary_if = batadv_seq_print_text_primary_if_get(seq); if (!primary_if) return 0; batadv_mcast_flags_print_header(bat_priv, seq); for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, head, hash_entry) { if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capa_initialized)) continue; if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capabilities)) { seq_printf(seq, "%pM -\n", orig_node->orig); continue; } flags = orig_node->mcast_flags; seq_printf(seq, "%pM [%c%c%c]\n", orig_node->orig, (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.'); } rcu_read_unlock(); } batadv_hardif_put(primary_if); return 0; } #endif /** * batadv_mcast_free - free the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ void batadv_mcast_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->mcast.work); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); /* safely calling outside of worker, as worker was canceled above */ batadv_mcast_mla_tt_retract(bat_priv, NULL); } /** * batadv_mcast_forw_send_orig() - send a multicast packet to an originator * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to send * @vid: the vlan identifier * @orig_node: the originator to send the packet to * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, struct batadv_orig_node *orig_node) { /* Avoid sending multicast-in-unicast packets to other BLA * gateways - they already got the frame from the LAN side * we share with them. * TODO: Refactor to take BLA into account earlier, to avoid * reducing the mcast_fanout count. */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) { dev_kfree_skb(skb); return NET_XMIT_SUCCESS; } return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, orig_node, vid); } /** * batadv_mcast_purge_orig - reset originator global mcast state modifications * @orig: the originator which is going to get purged */ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) { struct batadv_priv *bat_priv = orig->bat_priv; spin_lock_bh(&orig->mcast_handler_lock); if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities) && test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized)) atomic_dec(&bat_priv->mcast.num_disabled); batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); spin_unlock_bh(&orig->mcast_handler_lock); }
93 93 93 93 93 5 93 91 91 91 91 91 91 91 90 90 90 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2, * or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * namei.c */ /* * This file implements code to do filename lookup in directories. * * Like inodes, directories are packed into compressed metadata blocks, stored * in a directory table. Directories are accessed using the start address of * the metablock containing the directory and the offset into the * decompressed block (<block, offset>). * * Directories are organised in a slightly complex way, and are not simply * a list of file names. The organisation takes advantage of the * fact that (in most cases) the inodes of the files will be in the same * compressed metadata block, and therefore, can share the start block. * Directories are therefore organised in a two level list, a directory * header containing the shared start block value, and a sequence of directory * entries, each of which share the shared start block. A new directory header * is written once/if the inode start block changes. The directory * header/directory entry list is repeated as many times as necessary. * * Directories are sorted, and can contain a directory index to speed up * file lookup. Directory indexes store one entry per metablock, each entry * storing the index/filename mapping to the first directory header * in each metadata block. Directories are sorted in alphabetical order, * and at lookup the index is scanned linearly looking for the first filename * alphabetically larger than the filename being looked up. At this point the * location of the metadata block the filename is in has been found. * The general idea of the index is ensure only one metadata block needs to be * decompressed to do a lookup irrespective of the length of the directory. * This scheme has the advantage that it doesn't require extra memory overhead * and doesn't require much extra storage on disk. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/dcache.h> #include <linux/xattr.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "xattr.h" /* * Lookup name in the directory index, returning the location of the metadata * block containing it, and the directory index this represents. * * If we get an error reading the index then return the part of the index * (if any) we have managed to read - the index isn't essential, just * quicker. */ static int get_dir_index_using_name(struct super_block *sb, u64 *next_block, int *next_offset, u64 index_start, int index_offset, int i_count, const char *name, int len) { struct squashfs_sb_info *msblk = sb->s_fs_info; int i, length = 0, err; unsigned int size; struct squashfs_dir_index *index; char *str; TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count); index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL); if (index == NULL) { ERROR("Failed to allocate squashfs_dir_index\n"); goto out; } str = &index->name[SQUASHFS_NAME_LEN + 1]; strncpy(str, name, len); str[len] = '\0'; for (i = 0; i < i_count; i++) { err = squashfs_read_metadata(sb, index, &index_start, &index_offset, sizeof(*index)); if (err < 0) break; size = le32_to_cpu(index->size) + 1; if (size > SQUASHFS_NAME_LEN) break; err = squashfs_read_metadata(sb, index->name, &index_start, &index_offset, size); if (err < 0) break; index->name[size] = '\0'; if (strcmp(index->name, str) > 0) break; length = le32_to_cpu(index->index); *next_block = le32_to_cpu(index->start_block) + msblk->directory_table; } *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; kfree(index); out: /* * Return index (f_pos) of the looked up metadata block. Translate * from internal f_pos to external f_pos which is offset by 3 because * we invent "." and ".." entries which are not actually stored in the * directory. */ return length + 3; } static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { const unsigned char *name = dentry->d_name.name; int len = dentry->d_name.len; struct inode *inode = NULL; struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info; struct squashfs_dir_header dirh; struct squashfs_dir_entry *dire; u64 block = squashfs_i(dir)->start + msblk->directory_table; int offset = squashfs_i(dir)->offset; int err, length; unsigned int dir_count, size; TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset); dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); if (dire == NULL) { ERROR("Failed to allocate squashfs_dir_entry\n"); return ERR_PTR(-ENOMEM); } if (len > SQUASHFS_NAME_LEN) { err = -ENAMETOOLONG; goto failed; } length = get_dir_index_using_name(dir->i_sb, &block, &offset, squashfs_i(dir)->dir_idx_start, squashfs_i(dir)->dir_idx_offset, squashfs_i(dir)->dir_idx_cnt, name, len); while (length < i_size_read(dir)) { /* * Read directory header. */ err = squashfs_read_metadata(dir->i_sb, &dirh, &block, &offset, sizeof(dirh)); if (err < 0) goto read_failure; length += sizeof(dirh); dir_count = le32_to_cpu(dirh.count) + 1; if (dir_count > SQUASHFS_DIR_COUNT) goto data_error; while (dir_count--) { /* * Read directory entry. */ err = squashfs_read_metadata(dir->i_sb, dire, &block, &offset, sizeof(*dire)); if (err < 0) goto read_failure; size = le16_to_cpu(dire->size) + 1; /* size should never be larger than SQUASHFS_NAME_LEN */ if (size > SQUASHFS_NAME_LEN) goto data_error; err = squashfs_read_metadata(dir->i_sb, dire->name, &block, &offset, size); if (err < 0) goto read_failure; length += sizeof(*dire) + size; if (name[0] < dire->name[0]) goto exit_lookup; if (len == size && !strncmp(name, dire->name, len)) { unsigned int blk, off, ino_num; long long ino; blk = le32_to_cpu(dirh.start_block); off = le16_to_cpu(dire->offset); ino_num = le32_to_cpu(dirh.inode_number) + (short) le16_to_cpu(dire->inode_number); ino = SQUASHFS_MKINODE(blk, off); TRACE("calling squashfs_iget for directory " "entry %s, inode %x:%x, %d\n", name, blk, off, ino_num); inode = squashfs_iget(dir->i_sb, ino, ino_num); goto exit_lookup; } } } exit_lookup: kfree(dire); return d_splice_alias(inode, dentry); data_error: err = -EIO; read_failure: ERROR("Unable to read directory block [%llx:%x]\n", squashfs_i(dir)->start + msblk->directory_table, squashfs_i(dir)->offset); failed: kfree(dire); return ERR_PTR(err); } const struct inode_operations squashfs_dir_inode_ops = { .lookup = squashfs_lookup, .listxattr = squashfs_listxattr };
39 39 39 39 39 38 28 39 26 24 26 27 39 39 39 39 39 39 39 39 39 5 5 5 40 39 39 1 1 40 39 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 /* * linux/fs/9p/trans_rdma.c * * RDMA transport layer based on the trans_fd.c implementation. * * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com> * Copyright (C) 2006 by Russ Cox <rsc@swtch.com> * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to: * Free Software Foundation * 51 Franklin Street, Fifth Floor * Boston, MA 02111-1301 USA * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/in.h> #include <linux/module.h> #include <linux/net.h> #include <linux/ipv6.h> #include <linux/kthread.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/un.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/idr.h> #include <linux/file.h> #include <linux/parser.h> #include <linux/semaphore.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #define P9_PORT 5640 #define P9_RDMA_SQ_DEPTH 32 #define P9_RDMA_RQ_DEPTH 32 #define P9_RDMA_SEND_SGE 4 #define P9_RDMA_RECV_SGE 4 #define P9_RDMA_IRD 0 #define P9_RDMA_ORD 0 #define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ #define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */ /** * struct p9_trans_rdma - RDMA transport instance * * @state: tracks the transport state machine for connection setup and tear down * @cm_id: The RDMA CM ID * @pd: Protection Domain pointer * @qp: Queue Pair pointer * @cq: Completion Queue pointer * @dm_mr: DMA Memory Region pointer * @lkey: The local access only memory region key * @timeout: Number of uSecs to wait for connection management events * @privport: Whether a privileged port may be used * @port: The port to use * @sq_depth: The depth of the Send Queue * @sq_sem: Semaphore for the SQ * @rq_depth: The depth of the Receive Queue. * @rq_sem: Semaphore for the RQ * @excess_rc : Amount of posted Receive Contexts without a pending request. * See rdma_request() * @addr: The remote peer's address * @req_lock: Protects the active request list * @cm_done: Completion event for connection management tracking */ struct p9_trans_rdma { enum { P9_RDMA_INIT, P9_RDMA_ADDR_RESOLVED, P9_RDMA_ROUTE_RESOLVED, P9_RDMA_CONNECTED, P9_RDMA_FLUSHING, P9_RDMA_CLOSING, P9_RDMA_CLOSED, } state; struct rdma_cm_id *cm_id; struct ib_pd *pd; struct ib_qp *qp; struct ib_cq *cq; long timeout; bool privport; u16 port; int sq_depth; struct semaphore sq_sem; int rq_depth; struct semaphore rq_sem; atomic_t excess_rc; struct sockaddr_in addr; spinlock_t req_lock; struct completion cm_done; }; /** * p9_rdma_context - Keeps track of in-process WR * * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) */ struct p9_rdma_req; struct p9_rdma_context { struct ib_cqe cqe; dma_addr_t busa; union { struct p9_req_t *req; struct p9_fcall *rc; }; }; /** * p9_rdma_opts - Collection of mount options * @port: port of connection * @sq_depth: The requested depth of the SQ. This really doesn't need * to be any deeper than the number of threads used in the client * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth * @timeout: Time to wait in msecs for CM events */ struct p9_rdma_opts { short port; bool privport; int sq_depth; int rq_depth; long timeout; }; /* * Option Parsing (code inspired by NFS code) */ enum { /* Options that take integer arguments */ Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, /* Options that take no argument */ Opt_privport, Opt_err, }; static match_table_t tokens = { {Opt_port, "port=%u"}, {Opt_sq_depth, "sq=%u"}, {Opt_rq_depth, "rq=%u"}, {Opt_timeout, "timeout=%u"}, {Opt_privport, "privport"}, {Opt_err, NULL}, }; static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) { struct p9_trans_rdma *rdma = clnt->trans; if (rdma->port != P9_PORT) seq_printf(m, ",port=%u", rdma->port); if (rdma->sq_depth != P9_RDMA_SQ_DEPTH) seq_printf(m, ",sq=%u", rdma->sq_depth); if (rdma->rq_depth != P9_RDMA_RQ_DEPTH) seq_printf(m, ",rq=%u", rdma->rq_depth); if (rdma->timeout != P9_RDMA_TIMEOUT) seq_printf(m, ",timeout=%lu", rdma->timeout); if (rdma->privport) seq_puts(m, ",privport"); return 0; } /** * parse_opts - parse mount options into rdma options structure * @params: options string passed from mount * @opts: rdma transport-specific structure to parse options into * * Returns 0 upon success, -ERRNO upon failure */ static int parse_opts(char *params, struct p9_rdma_opts *opts) { char *p; substring_t args[MAX_OPT_ARGS]; int option; char *options, *tmp_options; opts->port = P9_PORT; opts->sq_depth = P9_RDMA_SQ_DEPTH; opts->rq_depth = P9_RDMA_RQ_DEPTH; opts->timeout = P9_RDMA_TIMEOUT; opts->privport = false; if (!params) return 0; tmp_options = kstrdup(params, GFP_KERNEL); if (!tmp_options) { p9_debug(P9_DEBUG_ERROR, "failed to allocate copy of option string\n"); return -ENOMEM; } options = tmp_options; while ((p = strsep(&options, ",")) != NULL) { int token; int r; if (!*p) continue; token = match_token(p, tokens, args); if ((token != Opt_err) && (token != Opt_privport)) { r = match_int(&args[0], &option); if (r < 0) { p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); continue; } } switch (token) { case Opt_port: opts->port = option; break; case Opt_sq_depth: opts->sq_depth = option; break; case Opt_rq_depth: opts->rq_depth = option; break; case Opt_timeout: opts->timeout = option; break; case Opt_privport: opts->privport = true; break; default: continue; } } /* RQ must be at least as large as the SQ */ opts->rq_depth = max(opts->rq_depth, opts->sq_depth); kfree(tmp_options); return 0; } static int p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { struct p9_client *c = id->context; struct p9_trans_rdma *rdma = c->trans; switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: BUG_ON(rdma->state != P9_RDMA_INIT); rdma->state = P9_RDMA_ADDR_RESOLVED; break; case RDMA_CM_EVENT_ROUTE_RESOLVED: BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED); rdma->state = P9_RDMA_ROUTE_RESOLVED; break; case RDMA_CM_EVENT_ESTABLISHED: BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED); rdma->state = P9_RDMA_CONNECTED; break; case RDMA_CM_EVENT_DISCONNECTED: if (rdma) rdma->state = P9_RDMA_CLOSED; c->status = Disconnected; break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: break; case RDMA_CM_EVENT_ADDR_CHANGE: case RDMA_CM_EVENT_ROUTE_ERROR: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_MULTICAST_JOIN: case RDMA_CM_EVENT_MULTICAST_ERROR: case RDMA_CM_EVENT_REJECTED: case RDMA_CM_EVENT_CONNECT_REQUEST: case RDMA_CM_EVENT_CONNECT_RESPONSE: case RDMA_CM_EVENT_CONNECT_ERROR: case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_UNREACHABLE: c->status = Disconnected; rdma_disconnect(rdma->cm_id); break; default: BUG(); } complete(&rdma->cm_done); return 0; } static void recv_done(struct ib_cq *cq, struct ib_wc *wc) { struct p9_client *client = cq->cq_context; struct p9_trans_rdma *rdma = client->trans; struct p9_rdma_context *c = container_of(wc->wr_cqe, struct p9_rdma_context, cqe); struct p9_req_t *req; int err = 0; int16_t tag; req = NULL; ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, DMA_FROM_DEVICE); if (wc->status != IB_WC_SUCCESS) goto err_out; err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); if (err) goto err_out; req = p9_tag_lookup(client, tag); if (!req) goto err_out; /* Check that we have not yet received a reply for this request. */ if (unlikely(req->rc)) { pr_err("Duplicate reply for request %d", tag); goto err_out; } req->rc = c->rc; p9_client_cb(client, req, REQ_STATUS_RCVD); out: up(&rdma->rq_sem); kfree(c); return; err_out: p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, wc->status); rdma->state = P9_RDMA_FLUSHING; client->status = Disconnected; goto out; } static void send_done(struct ib_cq *cq, struct ib_wc *wc) { struct p9_client *client = cq->cq_context; struct p9_trans_rdma *rdma = client->trans; struct p9_rdma_context *c = container_of(wc->wr_cqe, struct p9_rdma_context, cqe); ib_dma_unmap_single(rdma->cm_id->device, c->busa, c->req->tc->size, DMA_TO_DEVICE); up(&rdma->sq_sem); kfree(c); } static void qp_event_handler(struct ib_event *event, void *context) { p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n", event->event, context); } static void rdma_destroy_trans(struct p9_trans_rdma *rdma) { if (!rdma) return; if (rdma->qp && !IS_ERR(rdma->qp)) ib_destroy_qp(rdma->qp); if (rdma->pd && !IS_ERR(rdma->pd)) ib_dealloc_pd(rdma->pd); if (rdma->cq && !IS_ERR(rdma->cq)) ib_free_cq(rdma->cq); if (rdma->cm_id && !IS_ERR(rdma->cm_id)) rdma_destroy_id(rdma->cm_id); kfree(rdma); } static int post_recv(struct p9_client *client, struct p9_rdma_context *c) { struct p9_trans_rdma *rdma = client->trans; struct ib_recv_wr wr, *bad_wr; struct ib_sge sge; c->busa = ib_dma_map_single(rdma->cm_id->device, c->rc->sdata, client->msize, DMA_FROM_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; c->cqe.done = recv_done; sge.addr = c->busa; sge.length = client->msize; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; wr.wr_cqe = &c->cqe; wr.sg_list = &sge; wr.num_sge = 1; return ib_post_recv(rdma->qp, &wr, &bad_wr); error: p9_debug(P9_DEBUG_ERROR, "EIO\n"); return -EIO; } static int rdma_request(struct p9_client *client, struct p9_req_t *req) { struct p9_trans_rdma *rdma = client->trans; struct ib_send_wr wr, *bad_wr; struct ib_sge sge; int err = 0; unsigned long flags; struct p9_rdma_context *c = NULL; struct p9_rdma_context *rpl_context = NULL; /* When an error occurs between posting the recv and the send, * there will be a receive context posted without a pending request. * Since there is no way to "un-post" it, we remember it and skip * post_recv() for the next request. * So here, * see if we are this `next request' and need to absorb an excess rc. * If yes, then drop and free our own, and do not recv_post(). **/ if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { /* Got one ! */ kfree(req->rc); req->rc = NULL; goto dont_need_post_recv; } else { /* We raced and lost. */ atomic_inc(&rdma->excess_rc); } } /* Allocate an fcall for the reply */ rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS); if (!rpl_context) { err = -ENOMEM; goto recv_error; } rpl_context->rc = req->rc; /* * Post a receive buffer for this request. We need to ensure * there is a reply buffer available for every outstanding * request. A flushed request can result in no reply for an * outstanding request, so we must keep a count to avoid * overflowing the RQ. */ if (down_interruptible(&rdma->rq_sem)) { err = -EINTR; goto recv_error; } err = post_recv(client, rpl_context); if (err) { p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err); goto recv_error; } /* remove posted receive buffer from request structure */ req->rc = NULL; dont_need_post_recv: /* Post the request */ c = kmalloc(sizeof *c, GFP_NOFS); if (!c) { err = -ENOMEM; goto send_error; } c->req = req; c->busa = ib_dma_map_single(rdma->cm_id->device, c->req->tc->sdata, c->req->tc->size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) { err = -EIO; goto send_error; } c->cqe.done = send_done; sge.addr = c->busa; sge.length = c->req->tc->size; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; wr.wr_cqe = &c->cqe; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; wr.sg_list = &sge; wr.num_sge = 1; if (down_interruptible(&rdma->sq_sem)) { err = -EINTR; goto send_error; } /* Mark request as `sent' *before* we actually send it, * because doing if after could erase the REQ_STATUS_RCVD * status in case of a very fast reply. */ req->status = REQ_STATUS_SENT; err = ib_post_send(rdma->qp, &wr, &bad_wr); if (err) goto send_error; /* Success */ return 0; /* Handle errors that happened during or while preparing the send: */ send_error: req->status = REQ_STATUS_ERROR; kfree(c); p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); /* Ach. * We did recv_post(), but not send. We have one recv_post in excess. */ atomic_inc(&rdma->excess_rc); return err; /* Handle errors that happened during or while preparing post_recv(): */ recv_error: kfree(rpl_context); spin_lock_irqsave(&rdma->req_lock, flags); if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) { rdma->state = P9_RDMA_CLOSING; spin_unlock_irqrestore(&rdma->req_lock, flags); rdma_disconnect(rdma->cm_id); } else spin_unlock_irqrestore(&rdma->req_lock, flags); return err; } static void rdma_close(struct p9_client *client) { struct p9_trans_rdma *rdma; if (!client) return; rdma = client->trans; if (!rdma) return; client->status = Disconnected; rdma_disconnect(rdma->cm_id); rdma_destroy_trans(rdma); } /** * alloc_rdma - Allocate and initialize the rdma transport structure * @opts: Mount options structure */ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) { struct p9_trans_rdma *rdma; rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL); if (!rdma) return NULL; rdma->port = opts->port; rdma->privport = opts->privport; rdma->sq_depth = opts->sq_depth; rdma->rq_depth = opts->rq_depth; rdma->timeout = opts->timeout; spin_lock_init(&rdma->req_lock); init_completion(&rdma->cm_done); sema_init(&rdma->sq_sem, rdma->sq_depth); sema_init(&rdma->rq_sem, rdma->rq_depth); atomic_set(&rdma->excess_rc, 0); return rdma; } static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) { /* Nothing to do here. * We will take care of it (if we have to) in rdma_cancelled() */ return 1; } /* A request has been fully flushed without a reply. * That means we have posted one buffer in excess. */ static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) { struct p9_trans_rdma *rdma = client->trans; atomic_inc(&rdma->excess_rc); return 0; } static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma) { struct sockaddr_in cl = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), }; int port, err = -EINVAL; for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) { cl.sin_port = htons((ushort)port); err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl); if (err != -EADDRINUSE) break; } return err; } /** * trans_create_rdma - Transport method for creating atransport instance * @client: client instance * @addr: IP address string * @args: Mount options string */ static int rdma_create_trans(struct p9_client *client, const char *addr, char *args) { int err; struct p9_rdma_opts opts; struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; if (addr == NULL) return -EINVAL; /* Parse the transport specific mount options */ err = parse_opts(args, &opts); if (err < 0) return err; /* Create and initialize the RDMA transport structure */ rdma = alloc_rdma(&opts); if (!rdma) return -ENOMEM; /* Create the RDMA CM ID */ rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(rdma->cm_id)) goto error; /* Associate the client with the transport */ client->trans = rdma; /* Bind to a privileged port if we need to */ if (opts.privport) { err = p9_rdma_bind_privport(rdma); if (err < 0) { pr_err("%s (%d): problem binding to privport: %d\n", __func__, task_pid_nr(current), -err); goto error; } } /* Resolve the server's address */ rdma->addr.sin_family = AF_INET; rdma->addr.sin_addr.s_addr = in_aton(addr); rdma->addr.sin_port = htons(opts.port); err = rdma_resolve_addr(rdma->cm_id, NULL, (struct sockaddr *)&rdma->addr, rdma->timeout); if (err) goto error; err = wait_for_completion_interruptible(&rdma->cm_done); if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED)) goto error; /* Resolve the route to the server */ err = rdma_resolve_route(rdma->cm_id, rdma->timeout); if (err) goto error; err = wait_for_completion_interruptible(&rdma->cm_done); if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) goto error; /* Create the Completion Queue */ rdma->cq = ib_alloc_cq(rdma->cm_id->device, client, opts.sq_depth + opts.rq_depth + 1, 0, IB_POLL_SOFTIRQ); if (IS_ERR(rdma->cq)) goto error; /* Create the Protection Domain */ rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0); if (IS_ERR(rdma->pd)) goto error; /* Create the Queue Pair */ memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = client; qp_attr.cap.max_send_wr = opts.sq_depth; qp_attr.cap.max_recv_wr = opts.rq_depth; qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE; qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; qp_attr.qp_type = IB_QPT_RC; qp_attr.send_cq = rdma->cq; qp_attr.recv_cq = rdma->cq; err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr); if (err) goto error; rdma->qp = rdma->cm_id->qp; /* Request a connection */ memset(&conn_param, 0, sizeof(conn_param)); conn_param.private_data = NULL; conn_param.private_data_len = 0; conn_param.responder_resources = P9_RDMA_IRD; conn_param.initiator_depth = P9_RDMA_ORD; err = rdma_connect(rdma->cm_id, &conn_param); if (err) goto error; err = wait_for_completion_interruptible(&rdma->cm_done); if (err || (rdma->state != P9_RDMA_CONNECTED)) goto error; client->status = Connected; return 0; error: rdma_destroy_trans(rdma); return -ENOTCONN; } static struct p9_trans_module p9_rdma_trans = { .name = "rdma", .maxsize = P9_RDMA_MAXSIZE, .def = 0, .owner = THIS_MODULE, .create = rdma_create_trans, .close = rdma_close, .request = rdma_request, .cancel = rdma_cancel, .cancelled = rdma_cancelled, .show_options = p9_rdma_show_options, }; /** * p9_trans_rdma_init - Register the 9P RDMA transport driver */ static int __init p9_trans_rdma_init(void) { v9fs_register_trans(&p9_rdma_trans); return 0; } static void __exit p9_trans_rdma_exit(void) { v9fs_unregister_trans(&p9_rdma_trans); } module_init(p9_trans_rdma_init); module_exit(p9_trans_rdma_exit); MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); MODULE_DESCRIPTION("RDMA Transport for 9P"); MODULE_LICENSE("Dual BSD/GPL");
289 286 286 286 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 /* * Kernel-based Virtual Machine driver for Linux * * Copyright 2016 Red Hat, Inc. and/or its affiliates. * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. * */ #include <linux/kvm_host.h> #include <linux/debugfs.h> bool kvm_arch_has_vcpu_debugfs(void) { return true; } static int vcpu_get_tsc_offset(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; *val = vcpu->arch.tsc_offset; return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_offset_fops, vcpu_get_tsc_offset, NULL, "%lld\n"); static int vcpu_get_tsc_scaling_ratio(void *data, u64 *val) { struct kvm_vcpu *vcpu = (struct kvm_vcpu *) data; *val = vcpu->arch.tsc_scaling_ratio; return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_fops, vcpu_get_tsc_scaling_ratio, NULL, "%llu\n"); static int vcpu_get_tsc_scaling_frac_bits(void *data, u64 *val) { *val = kvm_tsc_scaling_ratio_frac_bits; return 0; } DEFINE_SIMPLE_ATTRIBUTE(vcpu_tsc_scaling_frac_fops, vcpu_get_tsc_scaling_frac_bits, NULL, "%llu\n"); int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu) { struct dentry *ret; ret = debugfs_create_file("tsc-offset", 0444, vcpu->debugfs_dentry, vcpu, &vcpu_tsc_offset_fops); if (!ret) return -ENOMEM; if (kvm_has_tsc_control) { ret = debugfs_create_file("tsc-scaling-ratio", 0444, vcpu->debugfs_dentry, vcpu, &vcpu_tsc_scaling_fops); if (!ret) return -ENOMEM; ret = debugfs_create_file("tsc-scaling-ratio-frac-bits", 0444, vcpu->debugfs_dentry, vcpu, &vcpu_tsc_scaling_frac_fops); if (!ret) return -ENOMEM; } return 0; }
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 /* Kernel module to match connection tracking byte counter. * GPL (C) 2002 Martin Devera (devik@cdi.cz). */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_connbytes.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_acct.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: Number of packets/bytes per connection matching"); MODULE_ALIAS("ipt_connbytes"); MODULE_ALIAS("ip6t_connbytes"); static bool connbytes_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; const struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int64_t what = 0; /* initialize to make gcc happy */ u_int64_t bytes = 0; u_int64_t pkts = 0; const struct nf_conn_acct *acct; const struct nf_conn_counter *counters; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; acct = nf_conn_acct_find(ct); if (!acct) return false; counters = acct->counter; switch (sinfo->what) { case XT_CONNBYTES_PKTS: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); what += atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } break; case XT_CONNBYTES_BYTES: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); break; case XT_CONNBYTES_DIR_REPLY: what = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; case XT_CONNBYTES_DIR_BOTH: what = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); what += atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); break; } break; case XT_CONNBYTES_AVGPKT: switch (sinfo->direction) { case XT_CONNBYTES_DIR_ORIGINAL: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets); break; case XT_CONNBYTES_DIR_REPLY: bytes = atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; case XT_CONNBYTES_DIR_BOTH: bytes = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].bytes) + atomic64_read(&counters[IP_CT_DIR_REPLY].bytes); pkts = atomic64_read(&counters[IP_CT_DIR_ORIGINAL].packets) + atomic64_read(&counters[IP_CT_DIR_REPLY].packets); break; } if (pkts != 0) what = div64_u64(bytes, pkts); break; } if (sinfo->count.to >= sinfo->count.from) return what <= sinfo->count.to && what >= sinfo->count.from; else /* inverted */ return what < sinfo->count.to || what > sinfo->count.from; } static int connbytes_mt_check(const struct xt_mtchk_param *par) { const struct xt_connbytes_info *sinfo = par->matchinfo; int ret; if (sinfo->what != XT_CONNBYTES_PKTS && sinfo->what != XT_CONNBYTES_BYTES && sinfo->what != XT_CONNBYTES_AVGPKT) return -EINVAL; if (sinfo->direction != XT_CONNBYTES_DIR_ORIGINAL && sinfo->direction != XT_CONNBYTES_DIR_REPLY && sinfo->direction != XT_CONNBYTES_DIR_BOTH) return -EINVAL; ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", par->family); /* * This filter cannot function correctly unless connection tracking * accounting is enabled, so complain in the hope that someone notices. */ if (!nf_ct_acct_enabled(par->net)) { pr_warn("Forcing CT accounting to be enabled\n"); nf_ct_set_acct(par->net, true); } return ret; } static void connbytes_mt_destroy(const struct xt_mtdtor_param *par) { nf_ct_netns_put(par->net, par->family); } static struct xt_match connbytes_mt_reg __read_mostly = { .name = "connbytes", .revision = 0, .family = NFPROTO_UNSPEC, .checkentry = connbytes_mt_check, .match = connbytes_mt, .destroy = connbytes_mt_destroy, .matchsize = sizeof(struct xt_connbytes_info), .me = THIS_MODULE, }; static int __init connbytes_mt_init(void) { return xt_register_match(&connbytes_mt_reg); } static void __exit connbytes_mt_exit(void) { xt_unregister_match(&connbytes_mt_reg); } module_init(connbytes_mt_init); module_exit(connbytes_mt_exit);
4 4 5 5 2 1 1 2 2 1 2 3 3 1 3 3 3 3 2 3 1 1 4 1 3 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 /* * vimc-debayer.c Virtual Media Controller Driver * * Copyright (C) 2015-2017 Helen Koike <helen.fornazier@gmail.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * */ #include <linux/component.h> #include <linux/module.h> #include <linux/platform_device.h> #include <linux/vmalloc.h> #include <linux/v4l2-mediabus.h> #include <media/v4l2-subdev.h> #include "vimc-common.h" #define VIMC_DEB_DRV_NAME "vimc-debayer" static unsigned int deb_mean_win_size = 3; module_param(deb_mean_win_size, uint, 0000); MODULE_PARM_DESC(deb_mean_win_size, " the window size to calculate the mean.\n" "NOTE: the window size need to be an odd number, as the main pixel " "stays in the center of the window, otherwise the next odd number " "is considered"); #define IS_SINK(pad) (!pad) #define IS_SRC(pad) (pad) enum vimc_deb_rgb_colors { VIMC_DEB_RED = 0, VIMC_DEB_GREEN = 1, VIMC_DEB_BLUE = 2, }; struct vimc_deb_pix_map { u32 code; enum vimc_deb_rgb_colors order[2][2]; }; struct vimc_deb_device { struct vimc_ent_device ved; struct v4l2_subdev sd; struct device *dev; /* The active format */ struct v4l2_mbus_framefmt sink_fmt; u32 src_code; void (*set_rgb_src)(struct vimc_deb_device *vdeb, unsigned int lin, unsigned int col, unsigned int rgb[3]); /* Values calculated when the stream starts */ u8 *src_frame; const struct vimc_deb_pix_map *sink_pix_map; unsigned int sink_bpp; }; static const struct v4l2_mbus_framefmt sink_fmt_default = { .width = 640, .height = 480, .code = MEDIA_BUS_FMT_RGB888_1X24, .field = V4L2_FIELD_NONE, .colorspace = V4L2_COLORSPACE_DEFAULT, }; static const struct vimc_deb_pix_map vimc_deb_pix_map_list[] = { { .code = MEDIA_BUS_FMT_SBGGR8_1X8, .order = { { VIMC_DEB_BLUE, VIMC_DEB_GREEN }, { VIMC_DEB_GREEN, VIMC_DEB_RED } } }, { .code = MEDIA_BUS_FMT_SGBRG8_1X8, .order = { { VIMC_DEB_GREEN, VIMC_DEB_BLUE }, { VIMC_DEB_RED, VIMC_DEB_GREEN } } }, { .code = MEDIA_BUS_FMT_SGRBG8_1X8, .order = { { VIMC_DEB_GREEN, VIMC_DEB_RED }, { VIMC_DEB_BLUE, VIMC_DEB_GREEN } } }, { .code = MEDIA_BUS_FMT_SRGGB8_1X8, .order = { { VIMC_DEB_RED, VIMC_DEB_GREEN }, { VIMC_DEB_GREEN, VIMC_DEB_BLUE } } }, { .code = MEDIA_BUS_FMT_SBGGR10_1X10, .order = { { VIMC_DEB_BLUE, VIMC_DEB_GREEN }, { VIMC_DEB_GREEN, VIMC_DEB_RED } } }, { .code = MEDIA_BUS_FMT_SGBRG10_1X10, .order = { { VIMC_DEB_GREEN, VIMC_DEB_BLUE }, { VIMC_DEB_RED, VIMC_DEB_GREEN } } }, { .code = MEDIA_BUS_FMT_SGRBG10_1X10, .order = { { VIMC_DEB_GREEN, VIMC_DEB_RED }, { VIMC_DEB_BLUE, VIMC_DEB_GREEN } } }, { .code = MEDIA_BUS_FMT_SRGGB10_1X10, .order = { { VIMC_DEB_RED, VIMC_DEB_GREEN }, { VIMC_DEB_GREEN, VIMC_DEB_BLUE } } }, { .code = MEDIA_BUS_FMT_SBGGR12_1X12, .order = { { VIMC_DEB_BLUE, VIMC_DEB_GREEN }, { VIMC_DEB_GREEN, VIMC_DEB_RED } } }, { .code = MEDIA_BUS_FMT_SGBRG12_1X12, .order = { { VIMC_DEB_GREEN, VIMC_DEB_BLUE }, { VIMC_DEB_RED, VIMC_DEB_GREEN } } }, { .code = MEDIA_BUS_FMT_SGRBG12_1X12, .order = { { VIMC_DEB_GREEN, VIMC_DEB_RED }, { VIMC_DEB_BLUE, VIMC_DEB_GREEN } } }, { .code = MEDIA_BUS_FMT_SRGGB12_1X12, .order = { { VIMC_DEB_RED, VIMC_DEB_GREEN }, { VIMC_DEB_GREEN, VIMC_DEB_BLUE } } }, }; static const struct vimc_deb_pix_map *vimc_deb_pix_map_by_code(u32 code) { unsigned int i; for (i = 0; i < ARRAY_SIZE(vimc_deb_pix_map_list); i++) if (vimc_deb_pix_map_list[i].code == code) return &vimc_deb_pix_map_list[i]; return NULL; } static int vimc_deb_init_cfg(struct v4l2_subdev *sd, struct v4l2_subdev_pad_config *cfg) { struct vimc_deb_device *vdeb = v4l2_get_subdevdata(sd); struct v4l2_mbus_framefmt *mf; unsigned int i; mf = v4l2_subdev_get_try_format(sd, cfg, 0); *mf = sink_fmt_default; for (i = 1; i < sd->entity.num_pads; i++) { mf = v4l2_subdev_get_try_format(sd, cfg, i); *mf = sink_fmt_default; mf->code = vdeb->src_code; } return 0; } static int vimc_deb_enum_mbus_code(struct v4l2_subdev *sd, struct v4l2_subdev_pad_config *cfg, struct v4l2_subdev_mbus_code_enum *code) { /* We only support one format for source pads */ if (IS_SRC(code->pad)) { struct vimc_deb_device *vdeb = v4l2_get_subdevdata(sd); if (code->index) return -EINVAL; code->code = vdeb->src_code; } else { if (code->index >= ARRAY_SIZE(vimc_deb_pix_map_list)) return -EINVAL; code->code = vimc_deb_pix_map_list[code->index].code; } return 0; } static int vimc_deb_enum_frame_size(struct v4l2_subdev *sd, struct v4l2_subdev_pad_config *cfg, struct v4l2_subdev_frame_size_enum *fse) { struct vimc_deb_device *vdeb = v4l2_get_subdevdata(sd); if (fse->index) return -EINVAL; if (IS_SINK(fse->pad)) { const struct vimc_deb_pix_map *vpix = vimc_deb_pix_map_by_code(fse->code); if (!vpix) return -EINVAL; } else if (fse->code != vdeb->src_code) { return -EINVAL; } fse->min_width = VIMC_FRAME_MIN_WIDTH; fse->max_width = VIMC_FRAME_MAX_WIDTH; fse->min_height = VIMC_FRAME_MIN_HEIGHT; fse->max_height = VIMC_FRAME_MAX_HEIGHT; return 0; } static int vimc_deb_get_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_pad_config *cfg, struct v4l2_subdev_format *fmt) { struct vimc_deb_device *vdeb = v4l2_get_subdevdata(sd); /* Get the current sink format */ fmt->format = fmt->which == V4L2_SUBDEV_FORMAT_TRY ? *v4l2_subdev_get_try_format(sd, cfg, 0) : vdeb->sink_fmt; /* Set the right code for the source pad */ if (IS_SRC(fmt->pad)) fmt->format.code = vdeb->src_code; return 0; } static void vimc_deb_adjust_sink_fmt(struct v4l2_mbus_framefmt *fmt) { const struct vimc_deb_pix_map *vpix; /* Don't accept a code that is not on the debayer table */ vpix = vimc_deb_pix_map_by_code(fmt->code); if (!vpix) fmt->code = sink_fmt_default.code; fmt->width = clamp_t(u32, fmt->width, VIMC_FRAME_MIN_WIDTH, VIMC_FRAME_MAX_WIDTH) & ~1; fmt->height = clamp_t(u32, fmt->height, VIMC_FRAME_MIN_HEIGHT, VIMC_FRAME_MAX_HEIGHT) & ~1; if (fmt->field == V4L2_FIELD_ANY) fmt->field = sink_fmt_default.field; vimc_colorimetry_clamp(fmt); } static int vimc_deb_set_fmt(struct v4l2_subdev *sd, struct v4l2_subdev_pad_config *cfg, struct v4l2_subdev_format *fmt) { struct vimc_deb_device *vdeb = v4l2_get_subdevdata(sd); struct v4l2_mbus_framefmt *sink_fmt; if (fmt->which == V4L2_SUBDEV_FORMAT_ACTIVE) { /* Do not change the format while stream is on */ if (vdeb->src_frame) return -EBUSY; sink_fmt = &vdeb->sink_fmt; } else { sink_fmt = v4l2_subdev_get_try_format(sd, cfg, 0); } /* * Do not change the format of the source pad, * it is propagated from the sink */ if (IS_SRC(fmt->pad)) { fmt->format = *sink_fmt; /* TODO: Add support for other formats */ fmt->format.code = vdeb->src_code; } else { /* Set the new format in the sink pad */ vimc_deb_adjust_sink_fmt(&fmt->format); dev_dbg(vdeb->dev, "%s: sink format update: " "old:%dx%d (0x%x, %d, %d, %d, %d) " "new:%dx%d (0x%x, %d, %d, %d, %d)\n", vdeb->sd.name, /* old */ sink_fmt->width, sink_fmt->height, sink_fmt->code, sink_fmt->colorspace, sink_fmt->quantization, sink_fmt->xfer_func, sink_fmt->ycbcr_enc, /* new */ fmt->format.width, fmt->format.height, fmt->format.code, fmt->format.colorspace, fmt->format.quantization, fmt->format.xfer_func, fmt->format.ycbcr_enc); *sink_fmt = fmt->format; } return 0; } static const struct v4l2_subdev_pad_ops vimc_deb_pad_ops = { .init_cfg = vimc_deb_init_cfg, .enum_mbus_code = vimc_deb_enum_mbus_code, .enum_frame_size = vimc_deb_enum_frame_size, .get_fmt = vimc_deb_get_fmt, .set_fmt = vimc_deb_set_fmt, }; static void vimc_deb_set_rgb_mbus_fmt_rgb888_1x24(struct vimc_deb_device *vdeb, unsigned int lin, unsigned int col, unsigned int rgb[3]) { unsigned int i, index; index = VIMC_FRAME_INDEX(lin, col, vdeb->sink_fmt.width, 3); for (i = 0; i < 3; i++) vdeb->src_frame[index + i] = rgb[i]; } static int vimc_deb_s_stream(struct v4l2_subdev *sd, int enable) { struct vimc_deb_device *vdeb = v4l2_get_subdevdata(sd); if (enable) { const struct vimc_pix_map *vpix; unsigned int frame_size; if (vdeb->src_frame) return 0; /* Calculate the frame size of the source pad */ vpix = vimc_pix_map_by_code(vdeb->src_code); frame_size = vdeb->sink_fmt.width * vdeb->sink_fmt.height * vpix->bpp; /* Save the bytes per pixel of the sink */ vpix = vimc_pix_map_by_code(vdeb->sink_fmt.code); vdeb->sink_bpp = vpix->bpp; /* Get the corresponding pixel map from the table */ vdeb->sink_pix_map = vimc_deb_pix_map_by_code(vdeb->sink_fmt.code); /* * Allocate the frame buffer. Use vmalloc to be able to * allocate a large amount of memory */ vdeb->src_frame = vmalloc(frame_size); if (!vdeb->src_frame) return -ENOMEM; } else { if (!vdeb->src_frame) return 0; vfree(vdeb->src_frame); vdeb->src_frame = NULL; } return 0; } static const struct v4l2_subdev_video_ops vimc_deb_video_ops = { .s_stream = vimc_deb_s_stream, }; static const struct v4l2_subdev_ops vimc_deb_ops = { .pad = &vimc_deb_pad_ops, .video = &vimc_deb_video_ops, }; static unsigned int vimc_deb_get_val(const u8 *bytes, const unsigned int n_bytes) { unsigned int i; unsigned int acc = 0; for (i = 0; i < n_bytes; i++) acc = acc + (bytes[i] << (8 * i)); return acc; } static void vimc_deb_calc_rgb_sink(struct vimc_deb_device *vdeb, const u8 *frame, const unsigned int lin, const unsigned int col, unsigned int rgb[3]) { unsigned int i, seek, wlin, wcol; unsigned int n_rgb[3] = {0, 0, 0}; for (i = 0; i < 3; i++) rgb[i] = 0; /* * Calculate how many we need to subtract to get to the pixel in * the top left corner of the mean window (considering the current * pixel as the center) */ seek = deb_mean_win_size / 2; /* Sum the values of the colors in the mean window */ dev_dbg(vdeb->dev, "deb: %s: --- Calc pixel %dx%d, window mean %d, seek %d ---\n", vdeb->sd.name, lin, col, vdeb->sink_fmt.height, seek); /* * Iterate through all the lines in the mean window, start * with zero if the pixel is outside the frame and don't pass * the height when the pixel is in the bottom border of the * frame */ for (wlin = seek > lin ? 0 : lin - seek; wlin < lin + seek + 1 && wlin < vdeb->sink_fmt.height; wlin++) { /* * Iterate through all the columns in the mean window, start * with zero if the pixel is outside the frame and don't pass * the width when the pixel is in the right border of the * frame */ for (wcol = seek > col ? 0 : col - seek; wcol < col + seek + 1 && wcol < vdeb->sink_fmt.width; wcol++) { enum vimc_deb_rgb_colors color; unsigned int index; /* Check which color this pixel is */ color = vdeb->sink_pix_map->order[wlin % 2][wcol % 2]; index = VIMC_FRAME_INDEX(wlin, wcol, vdeb->sink_fmt.width, vdeb->sink_bpp); dev_dbg(vdeb->dev, "deb: %s: RGB CALC: frame index %d, win pos %dx%d, color %d\n", vdeb->sd.name, index, wlin, wcol, color); /* Get its value */ rgb[color] = rgb[color] + vimc_deb_get_val(&frame[index], vdeb->sink_bpp); /* Save how many values we already added */ n_rgb[color]++; dev_dbg(vdeb->dev, "deb: %s: RGB CALC: val %d, n %d\n", vdeb->sd.name, rgb[color], n_rgb[color]); } } /* Calculate the mean */ for (i = 0; i < 3; i++) { dev_dbg(vdeb->dev, "deb: %s: PRE CALC: %dx%d Color %d, val %d, n %d\n", vdeb->sd.name, lin, col, i, rgb[i], n_rgb[i]); if (n_rgb[i]) rgb[i] = rgb[i] / n_rgb[i]; dev_dbg(vdeb->dev, "deb: %s: FINAL CALC: %dx%d Color %d, val %d\n", vdeb->sd.name, lin, col, i, rgb[i]); } } static void *vimc_deb_process_frame(struct vimc_ent_device *ved, const void *sink_frame) { struct vimc_deb_device *vdeb = container_of(ved, struct vimc_deb_device, ved); unsigned int rgb[3]; unsigned int i, j; /* If the stream in this node is not active, just return */ if (!vdeb->src_frame) return ERR_PTR(-EINVAL); for (i = 0; i < vdeb->sink_fmt.height; i++) for (j = 0; j < vdeb->sink_fmt.width; j++) { vimc_deb_calc_rgb_sink(vdeb, sink_frame, i, j, rgb); vdeb->set_rgb_src(vdeb, i, j, rgb); } return vdeb->src_frame; } static void vimc_deb_comp_unbind(struct device *comp, struct device *master, void *master_data) { struct vimc_ent_device *ved = dev_get_drvdata(comp); struct vimc_deb_device *vdeb = container_of(ved, struct vimc_deb_device, ved); vimc_ent_sd_unregister(ved, &vdeb->sd); kfree(vdeb); } static int vimc_deb_comp_bind(struct device *comp, struct device *master, void *master_data) { struct v4l2_device *v4l2_dev = master_data; struct vimc_platform_data *pdata = comp->platform_data; struct vimc_deb_device *vdeb; int ret; /* Allocate the vdeb struct */ vdeb = kzalloc(sizeof(*vdeb), GFP_KERNEL); if (!vdeb) return -ENOMEM; /* Initialize ved and sd */ ret = vimc_ent_sd_register(&vdeb->ved, &vdeb->sd, v4l2_dev, pdata->entity_name, MEDIA_ENT_F_ATV_DECODER, 2, (const unsigned long[2]) {MEDIA_PAD_FL_SINK, MEDIA_PAD_FL_SOURCE}, &vimc_deb_ops); if (ret) { kfree(vdeb); return ret; } vdeb->ved.process_frame = vimc_deb_process_frame; dev_set_drvdata(comp, &vdeb->ved); vdeb->dev = comp; /* Initialize the frame format */ vdeb->sink_fmt = sink_fmt_default; /* * TODO: Add support for more output formats, we only support * RGB888 for now * NOTE: the src format is always the same as the sink, except * for the code */ vdeb->src_code = MEDIA_BUS_FMT_RGB888_1X24; vdeb->set_rgb_src = vimc_deb_set_rgb_mbus_fmt_rgb888_1x24; return 0; } static const struct component_ops vimc_deb_comp_ops = { .bind = vimc_deb_comp_bind, .unbind = vimc_deb_comp_unbind, }; static int vimc_deb_probe(struct platform_device *pdev) { return component_add(&pdev->dev, &vimc_deb_comp_ops); } static int vimc_deb_remove(struct platform_device *pdev) { component_del(&pdev->dev, &vimc_deb_comp_ops); return 0; } static const struct platform_device_id vimc_deb_driver_ids[] = { { .name = VIMC_DEB_DRV_NAME, }, { } }; static struct platform_driver vimc_deb_pdrv = { .probe = vimc_deb_probe, .remove = vimc_deb_remove, .id_table = vimc_deb_driver_ids, .driver = { .name = VIMC_DEB_DRV_NAME, }, }; module_platform_driver(vimc_deb_pdrv); MODULE_DEVICE_TABLE(platform, vimc_deb_driver_ids); MODULE_DESCRIPTION("Virtual Media Controller Driver (VIMC) Debayer"); MODULE_AUTHOR("Helen Mae Koike Fornazier <helen.fornazier@gmail.com>"); MODULE_LICENSE("GPL");
7 4 6 5 5 5 5 9 5 5 1 5 1 5 4 3 2 3 2 2 2 2 2 3 5 5 5 4 4 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 /* * TCP Veno congestion control * * This is based on the congestion detection/avoidance scheme described in * C. P. Fu, S. C. Liew. * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." * IEEE Journal on Selected Areas in Communication, * Feb. 2003. * See http://www.ie.cuhk.edu.hk/fileadmin/staff_upload/soung/Journal/J3.pdf */ #include <linux/mm.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/inet_diag.h> #include <net/tcp.h> /* Default values of the Veno variables, in fixed-point representation * with V_PARAM_SHIFT bits to the right of the binary point. */ #define V_PARAM_SHIFT 1 static const int beta = 3 << V_PARAM_SHIFT; /* Veno variables */ struct veno { u8 doing_veno_now; /* if true, do veno for this rtt */ u16 cntrtt; /* # of rtts measured within last rtt */ u32 minrtt; /* min of rtts measured within last rtt (in usec) */ u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ u32 inc; /* decide whether to increase cwnd */ u32 diff; /* calculate the diff rate */ }; /* There are several situations when we must "re-start" Veno: * * o when a connection is established * o after an RTO * o after fast recovery * o when we send a packet and there is no outstanding * unacknowledged data (restarting an idle connection) * */ static inline void veno_enable(struct sock *sk) { struct veno *veno = inet_csk_ca(sk); /* turn on Veno */ veno->doing_veno_now = 1; veno->minrtt = 0x7fffffff; } static inline void veno_disable(struct sock *sk) { struct veno *veno = inet_csk_ca(sk); /* turn off Veno */ veno->doing_veno_now = 0; } static void tcp_veno_init(struct sock *sk) { struct veno *veno = inet_csk_ca(sk); veno->basertt = 0x7fffffff; veno->inc = 1; veno_enable(sk); } /* Do rtt sampling needed for Veno. */ static void tcp_veno_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct veno *veno = inet_csk_ca(sk); u32 vrtt; if (sample->rtt_us < 0) return; /* Never allow zero rtt or baseRTT */ vrtt = sample->rtt_us + 1; /* Filter to find propagation delay: */ if (vrtt < veno->basertt) veno->basertt = vrtt; /* Find the min rtt during the last rtt to find * the current prop. delay + queuing delay: */ veno->minrtt = min(veno->minrtt, vrtt); veno->cntrtt++; } static void tcp_veno_state(struct sock *sk, u8 ca_state) { if (ca_state == TCP_CA_Open) veno_enable(sk); else veno_disable(sk); } /* * If the connection is idle and we are restarting, * then we don't want to do any Veno calculations * until we get fresh rtt samples. So when we * restart, we reset our Veno state to a clean * state. After we get acks for this flight of * packets, _then_ we can make Veno calculations * again. */ static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) { if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) tcp_veno_init(sk); } static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (!veno->doing_veno_now) { tcp_reno_cong_avoid(sk, ack, acked); return; } /* limited by applications */ if (!tcp_is_cwnd_limited(sk)) return; /* We do the Veno calculations only if we got enough rtt samples */ if (veno->cntrtt <= 2) { /* We don't have enough rtt samples to do the Veno * calculation, so we'll behave like Reno. */ tcp_reno_cong_avoid(sk, ack, acked); } else { u64 target_cwnd; u32 rtt; /* We have enough rtt samples, so, using the Veno * algorithm, we determine the state of the network. */ rtt = veno->minrtt; target_cwnd = (u64)tp->snd_cwnd * veno->basertt; target_cwnd <<= V_PARAM_SHIFT; do_div(target_cwnd, rtt); veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; if (tcp_in_slow_start(tp)) { /* Slow start. */ tcp_slow_start(tp, acked); } else { /* Congestion avoidance. */ if (veno->diff < beta) { /* In the "non-congestive state", increase cwnd * every rtt. */ tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } else { /* In the "congestive state", increase cwnd * every other rtt. */ if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { if (veno->inc && tp->snd_cwnd < tp->snd_cwnd_clamp) { tp->snd_cwnd++; veno->inc = 0; } else veno->inc = 1; tp->snd_cwnd_cnt = 0; } else tp->snd_cwnd_cnt++; } } if (tp->snd_cwnd < 2) tp->snd_cwnd = 2; else if (tp->snd_cwnd > tp->snd_cwnd_clamp) tp->snd_cwnd = tp->snd_cwnd_clamp; } /* Wipe the slate clean for the next rtt. */ /* veno->cntrtt = 0; */ veno->minrtt = 0x7fffffff; } /* Veno MD phase */ static u32 tcp_veno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct veno *veno = inet_csk_ca(sk); if (veno->diff < beta) /* in "non-congestive state", cut cwnd by 1/5 */ return max(tp->snd_cwnd * 4 / 5, 2U); else /* in "congestive state", cut cwnd by 1/2 */ return max(tp->snd_cwnd >> 1U, 2U); } static struct tcp_congestion_ops tcp_veno __read_mostly = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_veno_cong_avoid, .pkts_acked = tcp_veno_pkts_acked, .set_state = tcp_veno_state, .cwnd_event = tcp_veno_cwnd_event, .owner = THIS_MODULE, .name = "veno", }; static int __init tcp_veno_register(void) { BUILD_BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); tcp_register_congestion_control(&tcp_veno); return 0; } static void __exit tcp_veno_unregister(void) { tcp_unregister_congestion_control(&tcp_veno); } module_init(tcp_veno_register); module_exit(tcp_veno_unregister); MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("TCP Veno");
23 23 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_SYNPROXY_H #define _NF_CONNTRACK_SYNPROXY_H #include <net/netns/generic.h> struct nf_conn_synproxy { u32 isn; u32 its; u32 tsoff; }; static inline struct nf_conn_synproxy *nfct_synproxy(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) return nf_ct_ext_find(ct, NF_CT_EXT_SYNPROXY); #else return NULL; #endif } static inline struct nf_conn_synproxy *nfct_synproxy_ext_add(struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) return nf_ct_ext_add(ct, NF_CT_EXT_SYNPROXY, GFP_ATOMIC); #else return NULL; #endif } static inline bool nf_ct_add_synproxy(struct nf_conn *ct, const struct nf_conn *tmpl) { if (tmpl && nfct_synproxy(tmpl)) { if (!nfct_seqadj_ext_add(ct)) return false; if (!nfct_synproxy_ext_add(ct)) return false; } return true; } struct synproxy_stats { unsigned int syn_received; unsigned int cookie_invalid; unsigned int cookie_valid; unsigned int cookie_retrans; unsigned int conn_reopened; }; struct synproxy_net { struct nf_conn *tmpl; struct synproxy_stats __percpu *stats; unsigned int hook_ref4; unsigned int hook_ref6; }; extern unsigned int synproxy_net_id; static inline struct synproxy_net *synproxy_pernet(struct net *net) { return net_generic(net, synproxy_net_id); } struct synproxy_options { u8 options; u8 wscale; u16 mss; u32 tsval; u32 tsecr; }; struct tcphdr; struct xt_synproxy_info; bool synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, const struct tcphdr *th, struct synproxy_options *opts); unsigned int synproxy_options_size(const struct synproxy_options *opts); void synproxy_build_options(struct tcphdr *th, const struct synproxy_options *opts); void synproxy_init_timestamp_cookie(const struct xt_synproxy_info *info, struct synproxy_options *opts); void synproxy_check_timestamp_cookie(struct synproxy_options *opts); unsigned int synproxy_tstamp_adjust(struct sk_buff *skb, unsigned int protoff, struct tcphdr *th, struct nf_conn *ct, enum ip_conntrack_info ctinfo, const struct nf_conn_synproxy *synproxy); #endif /* _NF_CONNTRACK_SYNPROXY_H */
1823 1822 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 /* * Copyright (c) 2007-2012 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301, USA */ #include <linux/netdevice.h> #include <net/genetlink.h> #include <net/netns/generic.h> #include "datapath.h" #include "vport-internal_dev.h" #include "vport-netdev.h" static void dp_detach_port_notify(struct vport *vport) { struct sk_buff *notify; struct datapath *dp; dp = vport->dp; notify = ovs_vport_cmd_build_info(vport, 0, 0, OVS_VPORT_CMD_DEL); ovs_dp_detach_port(vport); if (IS_ERR(notify)) { genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, 0, PTR_ERR(notify)); return; } genlmsg_multicast_netns(&dp_vport_genl_family, ovs_dp_get_net(dp), notify, 0, 0, GFP_KERNEL); } void ovs_dp_notify_wq(struct work_struct *work) { struct ovs_net *ovs_net = container_of(work, struct ovs_net, dp_notify_work); struct datapath *dp; ovs_lock(); list_for_each_entry(dp, &ovs_net->dps, list_node) { int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH)) dp_detach_port_notify(vport); } } } ovs_unlock(); } static int dp_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { struct ovs_net *ovs_net; struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct vport *vport = NULL; if (!ovs_is_internal_dev(dev)) vport = ovs_netdev_get_vport(dev); if (!vport) return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { /* upper_dev_unlink and decrement promisc immediately */ ovs_netdev_detach_dev(vport); /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), ovs_net_id); queue_work(system_wq, &ovs_net->dp_notify_work); } return NOTIFY_DONE; } struct notifier_block ovs_dp_device_notifier = { .notifier_call = dp_device_event };
101 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 /* * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS * project. * * Copyright (c) 2001-2005 Anton Altaparmakov * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program/include file is distributed in the hope that it will be * useful, but WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program (in the main directory of the Linux-NTFS * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _LINUX_NTFS_LAYOUT_H #define _LINUX_NTFS_LAYOUT_H #include <linux/types.h> #include <linux/bitops.h> #include <linux/list.h> #include <asm/byteorder.h> #include "types.h" /* The NTFS oem_id "NTFS " */ #define magicNTFS cpu_to_le64(0x202020205346544eULL) /* * Location of bootsector on partition: * The standard NTFS_BOOT_SECTOR is on sector 0 of the partition. * On NT4 and above there is one backup copy of the boot sector to * be found on the last sector of the partition (not normally accessible * from within Windows as the bootsector contained number of sectors * value is one less than the actual value!). * On versions of NT 3.51 and earlier, the backup copy was located at * number of sectors/2 (integer divide), i.e. in the middle of the volume. */ /* * BIOS parameter block (bpb) structure. */ typedef struct { le16 bytes_per_sector; /* Size of a sector in bytes. */ u8 sectors_per_cluster; /* Size of a cluster in sectors. */ le16 reserved_sectors; /* zero */ u8 fats; /* zero */ le16 root_entries; /* zero */ le16 sectors; /* zero */ u8 media_type; /* 0xf8 = hard disk */ le16 sectors_per_fat; /* zero */ le16 sectors_per_track; /* irrelevant */ le16 heads; /* irrelevant */ le32 hidden_sectors; /* zero */ le32 large_sectors; /* zero */ } __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK; /* * NTFS boot sector structure. */ typedef struct { u8 jump[3]; /* Irrelevant (jump to boot up code).*/ le64 oem_id; /* Magic "NTFS ". */ BIOS_PARAMETER_BLOCK bpb; /* See BIOS_PARAMETER_BLOCK. */ u8 unused[4]; /* zero, NTFS diskedit.exe states that this is actually: __u8 physical_drive; // 0x80 __u8 current_head; // zero __u8 extended_boot_signature; // 0x80 __u8 unused; // zero */ /*0x28*/sle64 number_of_sectors; /* Number of sectors in volume. Gives maximum volume size of 2^63 sectors. Assuming standard sector size of 512 bytes, the maximum byte size is approx. 4.7x10^21 bytes. (-; */ sle64 mft_lcn; /* Cluster location of mft data. */ sle64 mftmirr_lcn; /* Cluster location of copy of mft. */ s8 clusters_per_mft_record; /* Mft record size in clusters. */ u8 reserved0[3]; /* zero */ s8 clusters_per_index_record; /* Index block size in clusters. */ u8 reserved1[3]; /* zero */ le64 volume_serial_number; /* Irrelevant (serial number). */ le32 checksum; /* Boot sector checksum. */ /*0x54*/u8 bootstrap[426]; /* Irrelevant (boot up code). */ le16 end_of_sector_marker; /* End of bootsector magic. Always is 0xaa55 in little endian. */ /* sizeof() = 512 (0x200) bytes */ } __attribute__ ((__packed__)) NTFS_BOOT_SECTOR; /* * Magic identifiers present at the beginning of all ntfs record containing * records (like mft records for example). */ enum { /* Found in $MFT/$DATA. */ magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */ magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */ magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */ /* Found in $LogFile/$DATA. */ magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */ magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */ /* Found in $LogFile/$DATA. (May be found in $MFT/$DATA, also?) */ magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */ /* Found in all ntfs record containing records. */ magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector transfer was detected. */ /* * Found in $LogFile/$DATA when a page is full of 0xff bytes and is * thus not initialized. Page must be initialized before using it. */ magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */ }; typedef le32 NTFS_RECORD_TYPE; /* * Generic magic comparison macros. Finally found a use for the ## preprocessor * operator! (-8 */ static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r) { return (x == r); } #define ntfs_is_magic(x, m) __ntfs_is_magic(x, magic_##m) static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r) { return (*p == r); } #define ntfs_is_magicp(p, m) __ntfs_is_magicp(p, magic_##m) /* * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above. */ #define ntfs_is_file_record(x) ( ntfs_is_magic (x, FILE) ) #define ntfs_is_file_recordp(p) ( ntfs_is_magicp(p, FILE) ) #define ntfs_is_mft_record(x) ( ntfs_is_file_record (x) ) #define ntfs_is_mft_recordp(p) ( ntfs_is_file_recordp(p) ) #define ntfs_is_indx_record(x) ( ntfs_is_magic (x, INDX) ) #define ntfs_is_indx_recordp(p) ( ntfs_is_magicp(p, INDX) ) #define ntfs_is_hole_record(x) ( ntfs_is_magic (x, HOLE) ) #define ntfs_is_hole_recordp(p) ( ntfs_is_magicp(p, HOLE) ) #define ntfs_is_rstr_record(x) ( ntfs_is_magic (x, RSTR) ) #define ntfs_is_rstr_recordp(p) ( ntfs_is_magicp(p, RSTR) ) #define ntfs_is_rcrd_record(x) ( ntfs_is_magic (x, RCRD) ) #define ntfs_is_rcrd_recordp(p) ( ntfs_is_magicp(p, RCRD) ) #define ntfs_is_chkd_record(x) ( ntfs_is_magic (x, CHKD) ) #define ntfs_is_chkd_recordp(p) ( ntfs_is_magicp(p, CHKD) ) #define ntfs_is_baad_record(x) ( ntfs_is_magic (x, BAAD) ) #define ntfs_is_baad_recordp(p) ( ntfs_is_magicp(p, BAAD) ) #define ntfs_is_empty_record(x) ( ntfs_is_magic (x, empty) ) #define ntfs_is_empty_recordp(p) ( ntfs_is_magicp(p, empty) ) /* * The Update Sequence Array (usa) is an array of the le16 values which belong * to the end of each sector protected by the update sequence record in which * this array is contained. Note that the first entry is the Update Sequence * Number (usn), a cyclic counter of how many times the protected record has * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All * last le16's of each sector have to be equal to the usn (during reading) or * are set to it (during writing). If they are not, an incomplete multi sector * transfer has occurred when the data was written. * The maximum size for the update sequence array is fixed to: * maximum size = usa_ofs + (usa_count * 2) = 510 bytes * The 510 bytes comes from the fact that the last le16 in the array has to * (obviously) finish before the last le16 of the first 512-byte sector. * This formula can be used as a consistency check in that usa_ofs + * (usa_count * 2) has to be less than or equal to 510. */ typedef struct { NTFS_RECORD_TYPE magic; /* A four-byte magic identifying the record type and/or status. */ le16 usa_ofs; /* Offset to the Update Sequence Array (usa) from the start of the ntfs record. */ le16 usa_count; /* Number of le16 sized entries in the usa including the Update Sequence Number (usn), thus the number of fixups is the usa_count minus 1. */ } __attribute__ ((__packed__)) NTFS_RECORD; /* * System files mft record numbers. All these files are always marked as used * in the bitmap attribute of the mft; presumably in order to avoid accidental * allocation for random other mft records. Also, the sequence number for each * of the system files is always equal to their mft record number and it is * never modified. */ typedef enum { FILE_MFT = 0, /* Master file table (mft). Data attribute contains the entries and bitmap attribute records which ones are in use (bit==1). */ FILE_MFTMirr = 1, /* Mft mirror: copy of first four mft records in data attribute. If cluster size > 4kiB, copy of first N mft records, with N = cluster_size / mft_record_size. */ FILE_LogFile = 2, /* Journalling log in data attribute. */ FILE_Volume = 3, /* Volume name attribute and volume information attribute (flags and ntfs version). Windows refers to this file as volume DASD (Direct Access Storage Device). */ FILE_AttrDef = 4, /* Array of attribute definitions in data attribute. */ FILE_root = 5, /* Root directory. */ FILE_Bitmap = 6, /* Allocation bitmap of all clusters (lcns) in data attribute. */ FILE_Boot = 7, /* Boot sector (always at cluster 0) in data attribute. */ FILE_BadClus = 8, /* Contains all bad clusters in the non-resident data attribute. */ FILE_Secure = 9, /* Shared security descriptors in data attribute and two indexes into the descriptors. Appeared in Windows 2000. Before that, this file was named $Quota but was unused. */ FILE_UpCase = 10, /* Uppercase equivalents of all 65536 Unicode characters in data attribute. */ FILE_Extend = 11, /* Directory containing other system files (eg. $ObjId, $Quota, $Reparse and $UsnJrnl). This is new to NTFS3.0. */ FILE_reserved12 = 12, /* Reserved for future use (records 12-15). */ FILE_reserved13 = 13, FILE_reserved14 = 14, FILE_reserved15 = 15, FILE_first_user = 16, /* First user file, used as test limit for whether to allow opening a file or not. */ } NTFS_SYSTEM_FILES; /* * These are the so far known MFT_RECORD_* flags (16-bit) which contain * information about the mft record in which they are present. */ enum { MFT_RECORD_IN_USE = cpu_to_le16(0x0001), MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002), } __attribute__ ((__packed__)); typedef le16 MFT_RECORD_FLAGS; /* * mft references (aka file references or file record segment references) are * used whenever a structure needs to refer to a record in the mft. * * A reference consists of a 48-bit index into the mft and a 16-bit sequence * number used to detect stale references. * * For error reporting purposes we treat the 48-bit index as a signed quantity. * * The sequence number is a circular counter (skipping 0) describing how many * times the referenced mft record has been (re)used. This has to match the * sequence number of the mft record being referenced, otherwise the reference * is considered stale and removed (FIXME: only ntfsck or the driver itself?). * * If the sequence number is zero it is assumed that no sequence number * consistency checking should be performed. * * FIXME: Since inodes are 32-bit as of now, the driver needs to always check * for high_part being 0 and if not either BUG(), cause a panic() or handle * the situation in some other way. This shouldn't be a problem as a volume has * to become HUGE in order to need more than 32-bits worth of mft records. * Assuming the standard mft record size of 1kb only the records (never mind * the non-resident attributes, etc.) would require 4Tb of space on their own * for the first 32 bits worth of records. This is only if some strange person * doesn't decide to foul play and make the mft sparse which would be a really * horrible thing to do as it would trash our current driver implementation. )-: * Do I hear screams "we want 64-bit inodes!" ?!? (-; * * FIXME: The mft zone is defined as the first 12% of the volume. This space is * reserved so that the mft can grow contiguously and hence doesn't become * fragmented. Volume free space includes the empty part of the mft zone and * when the volume's free 88% are used up, the mft zone is shrunk by a factor * of 2, thus making more space available for more files/data. This process is * repeated every time there is no more free space except for the mft zone until * there really is no more free space. */ /* * Typedef the MFT_REF as a 64-bit value for easier handling. * Also define two unpacking macros to get to the reference (MREF) and * sequence number (MSEQNO) respectively. * The _LE versions are to be applied on little endian MFT_REFs. * Note: The _LE versions will return a CPU endian formatted value! */ #define MFT_REF_MASK_CPU 0x0000ffffffffffffULL #define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU) typedef u64 MFT_REF; typedef le64 leMFT_REF; #define MK_MREF(m, s) ((MFT_REF)(((MFT_REF)(s) << 48) | \ ((MFT_REF)(m) & MFT_REF_MASK_CPU))) #define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s)) #define MREF(x) ((unsigned long)((x) & MFT_REF_MASK_CPU)) #define MSEQNO(x) ((u16)(((x) >> 48) & 0xffff)) #define MREF_LE(x) ((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU)) #define MSEQNO_LE(x) ((u16)((le64_to_cpu(x) >> 48) & 0xffff)) #define IS_ERR_MREF(x) (((x) & 0x0000800000000000ULL) ? true : false) #define ERR_MREF(x) ((u64)((s64)(x))) #define MREF_ERR(x) ((int)((s64)(x))) /* * The mft record header present at the beginning of every record in the mft. * This is followed by a sequence of variable length attribute records which * is terminated by an attribute of type AT_END which is a truncated attribute * in that it only consists of the attribute type code AT_END and none of the * other members of the attribute structure are present. */ typedef struct { /*Ofs*/ /* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ le16 usa_ofs; /* See NTFS_RECORD definition above. */ le16 usa_count; /* See NTFS_RECORD definition above. */ /* 8*/ le64 lsn; /* $LogFile sequence number for this record. Changed every time the record is modified. */ /* 16*/ le16 sequence_number; /* Number of times this mft record has been reused. (See description for MFT_REF above.) NOTE: The increment (skipping zero) is done when the file is deleted. NOTE: If this is zero it is left zero. */ /* 18*/ le16 link_count; /* Number of hard links, i.e. the number of directory entries referencing this record. NOTE: Only used in mft base records. NOTE: When deleting a directory entry we check the link_count and if it is 1 we delete the file. Otherwise we delete the FILE_NAME_ATTR being referenced by the directory entry from the mft record and decrement the link_count. FIXME: Careful with Win32 + DOS names! */ /* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this mft record from the start of the mft record. NOTE: Must be aligned to 8-byte boundary. */ /* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file is deleted, the MFT_RECORD_IN_USE flag is set to zero. */ /* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. NOTE: Must be aligned to 8-byte boundary. */ /* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft record. This should be equal to the mft record size. */ /* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. When it is not zero it is a mft reference pointing to the base mft record to which this record belongs (this is then used to locate the attribute list attribute present in the base record which describes this extension record and hence might need modification when the extension record itself is modified, also locating the attribute list also means finding the other potential extents, belonging to the non-base mft record). */ /* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to the next attribute added to this mft record. NOTE: Incremented each time after it is used. NOTE: Every time the mft record is reused this number is set to zero. NOTE: The first instance number is always 0. */ /* The below fields are specific to NTFS 3.1+ (Windows XP and above): */ /* 42*/ le16 reserved; /* Reserved/alignment. */ /* 44*/ le32 mft_record_number; /* Number of this mft record. */ /* sizeof() = 48 bytes */ /* * When (re)using the mft record, we place the update sequence array at this * offset, i.e. before we start with the attributes. This also makes sense, * otherwise we could run into problems with the update sequence array * containing in itself the last two bytes of a sector which would mean that * multi sector transfer protection wouldn't work. As you can't protect data * by overwriting it since you then can't get it back... * When reading we obviously use the data from the ntfs record header. */ } __attribute__ ((__packed__)) MFT_RECORD; /* This is the version without the NTFS 3.1+ specific fields. */ typedef struct { /*Ofs*/ /* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ NTFS_RECORD_TYPE magic; /* Usually the magic is "FILE". */ le16 usa_ofs; /* See NTFS_RECORD definition above. */ le16 usa_count; /* See NTFS_RECORD definition above. */ /* 8*/ le64 lsn; /* $LogFile sequence number for this record. Changed every time the record is modified. */ /* 16*/ le16 sequence_number; /* Number of times this mft record has been reused. (See description for MFT_REF above.) NOTE: The increment (skipping zero) is done when the file is deleted. NOTE: If this is zero it is left zero. */ /* 18*/ le16 link_count; /* Number of hard links, i.e. the number of directory entries referencing this record. NOTE: Only used in mft base records. NOTE: When deleting a directory entry we check the link_count and if it is 1 we delete the file. Otherwise we delete the FILE_NAME_ATTR being referenced by the directory entry from the mft record and decrement the link_count. FIXME: Careful with Win32 + DOS names! */ /* 20*/ le16 attrs_offset; /* Byte offset to the first attribute in this mft record from the start of the mft record. NOTE: Must be aligned to 8-byte boundary. */ /* 22*/ MFT_RECORD_FLAGS flags; /* Bit array of MFT_RECORD_FLAGS. When a file is deleted, the MFT_RECORD_IN_USE flag is set to zero. */ /* 24*/ le32 bytes_in_use; /* Number of bytes used in this mft record. NOTE: Must be aligned to 8-byte boundary. */ /* 28*/ le32 bytes_allocated; /* Number of bytes allocated for this mft record. This should be equal to the mft record size. */ /* 32*/ leMFT_REF base_mft_record;/* This is zero for base mft records. When it is not zero it is a mft reference pointing to the base mft record to which this record belongs (this is then used to locate the attribute list attribute present in the base record which describes this extension record and hence might need modification when the extension record itself is modified, also locating the attribute list also means finding the other potential extents, belonging to the non-base mft record). */ /* 40*/ le16 next_attr_instance;/* The instance number that will be assigned to the next attribute added to this mft record. NOTE: Incremented each time after it is used. NOTE: Every time the mft record is reused this number is set to zero. NOTE: The first instance number is always 0. */ /* sizeof() = 42 bytes */ /* * When (re)using the mft record, we place the update sequence array at this * offset, i.e. before we start with the attributes. This also makes sense, * otherwise we could run into problems with the update sequence array * containing in itself the last two bytes of a sector which would mean that * multi sector transfer protection wouldn't work. As you can't protect data * by overwriting it since you then can't get it back... * When reading we obviously use the data from the ntfs record header. */ } __attribute__ ((__packed__)) MFT_RECORD_OLD; /* * System defined attributes (32-bit). Each attribute type has a corresponding * attribute name (Unicode string of maximum 64 character length) as described * by the attribute definitions present in the data attribute of the $AttrDef * system file. On NTFS 3.0 volumes the names are just as the types are named * in the below defines exchanging AT_ for the dollar sign ($). If that is not * a revealing choice of symbol I do not know what is... (-; */ enum { AT_UNUSED = cpu_to_le32( 0), AT_STANDARD_INFORMATION = cpu_to_le32( 0x10), AT_ATTRIBUTE_LIST = cpu_to_le32( 0x20), AT_FILE_NAME = cpu_to_le32( 0x30), AT_OBJECT_ID = cpu_to_le32( 0x40), AT_SECURITY_DESCRIPTOR = cpu_to_le32( 0x50), AT_VOLUME_NAME = cpu_to_le32( 0x60), AT_VOLUME_INFORMATION = cpu_to_le32( 0x70), AT_DATA = cpu_to_le32( 0x80), AT_INDEX_ROOT = cpu_to_le32( 0x90), AT_INDEX_ALLOCATION = cpu_to_le32( 0xa0), AT_BITMAP = cpu_to_le32( 0xb0), AT_REPARSE_POINT = cpu_to_le32( 0xc0), AT_EA_INFORMATION = cpu_to_le32( 0xd0), AT_EA = cpu_to_le32( 0xe0), AT_PROPERTY_SET = cpu_to_le32( 0xf0), AT_LOGGED_UTILITY_STREAM = cpu_to_le32( 0x100), AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32( 0x1000), AT_END = cpu_to_le32(0xffffffff) }; typedef le32 ATTR_TYPE; /* * The collation rules for sorting views/indexes/etc (32-bit). * * COLLATION_BINARY - Collate by binary compare where the first byte is most * significant. * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary * Unicode values, except that when a character can be uppercased, the * upper case value collates before the lower case one. * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation * is done very much like COLLATION_UNICODE_STRING. In fact I have no idea * what the difference is. Perhaps the difference is that file names * would treat some special characters in an odd way (see * unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[] * for what I mean but COLLATION_UNICODE_STRING would not give any special * treatment to any characters at all, but this is speculation. * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key * values. E.g. used for $SII index in FILE_Secure, which sorts by * security_id (le32). * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values. * E.g. used for $O index in FILE_Extend/$Quota. * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash * values and second by ascending security_id values. E.g. used for $SDH * index in FILE_Secure. * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending * le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which * sorts by object_id (16-byte), by splitting up the object_id in four * le32 values and using them as individual keys. E.g. take the following * two security_ids, stored as follows on disk: * 1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59 * 2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45 * To compare them, they are split into four le32 values each, like so: * 1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081 * 2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179 * Now, it is apparent why the 2nd object_id collates after the 1st: the * first le32 value of the 1st object_id is less than the first le32 of * the 2nd object_id. If the first le32 values of both object_ids were * equal then the second le32 values would be compared, etc. */ enum { COLLATION_BINARY = cpu_to_le32(0x00), COLLATION_FILE_NAME = cpu_to_le32(0x01), COLLATION_UNICODE_STRING = cpu_to_le32(0x02), COLLATION_NTOFS_ULONG = cpu_to_le32(0x10), COLLATION_NTOFS_SID = cpu_to_le32(0x11), COLLATION_NTOFS_SECURITY_HASH = cpu_to_le32(0x12), COLLATION_NTOFS_ULONGS = cpu_to_le32(0x13), }; typedef le32 COLLATION_RULE; /* * The flags (32-bit) describing attribute properties in the attribute * definition structure. FIXME: This information is based on Regis's * information and, according to him, it is not certain and probably * incomplete. The INDEXABLE flag is fairly certainly correct as only the file * name attribute has this flag set and this is the only attribute indexed in * NT4. */ enum { ATTR_DEF_INDEXABLE = cpu_to_le32(0x02), /* Attribute can be indexed. */ ATTR_DEF_MULTIPLE = cpu_to_le32(0x04), /* Attribute type can be present multiple times in the mft records of an inode. */ ATTR_DEF_NOT_ZERO = cpu_to_le32(0x08), /* Attribute value must contain at least one non-zero byte. */ ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be indexed and the attribute value must be unique for the attribute type in all of the mft records of an inode. */ ATTR_DEF_NAMED_UNIQUE = cpu_to_le32(0x20), /* Attribute must be named and the name must be unique for the attribute type in all of the mft records of an inode. */ ATTR_DEF_RESIDENT = cpu_to_le32(0x40), /* Attribute must be resident. */ ATTR_DEF_ALWAYS_LOG = cpu_to_le32(0x80), /* Always log modifications to this attribute, regardless of whether it is resident or non-resident. Without this, only log modifications if the attribute is resident. */ }; typedef le32 ATTR_DEF_FLAGS; /* * The data attribute of FILE_AttrDef contains a sequence of attribute * definitions for the NTFS volume. With this, it is supposed to be safe for an * older NTFS driver to mount a volume containing a newer NTFS version without * damaging it (that's the theory. In practice it's: not damaging it too much). * Entries are sorted by attribute type. The flags describe whether the * attribute can be resident/non-resident and possibly other things, but the * actual bits are unknown. */ typedef struct { /*hex ofs*/ /* 0*/ ntfschar name[0x40]; /* Unicode name of the attribute. Zero terminated. */ /* 80*/ ATTR_TYPE type; /* Type of the attribute. */ /* 84*/ le32 display_rule; /* Default display rule. FIXME: What does it mean? (AIA) */ /* 88*/ COLLATION_RULE collation_rule; /* Default collation rule. */ /* 8c*/ ATTR_DEF_FLAGS flags; /* Flags describing the attribute. */ /* 90*/ sle64 min_size; /* Optional minimum attribute size. */ /* 98*/ sle64 max_size; /* Maximum size of attribute. */ /* sizeof() = 0xa0 or 160 bytes */ } __attribute__ ((__packed__)) ATTR_DEF; /* * Attribute flags (16-bit). */ enum { ATTR_IS_COMPRESSED = cpu_to_le16(0x0001), ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method mask. Also, first illegal value. */ ATTR_IS_ENCRYPTED = cpu_to_le16(0x4000), ATTR_IS_SPARSE = cpu_to_le16(0x8000), } __attribute__ ((__packed__)); typedef le16 ATTR_FLAGS; /* * Attribute compression. * * Only the data attribute is ever compressed in the current ntfs driver in * Windows. Further, compression is only applied when the data attribute is * non-resident. Finally, to use compression, the maximum allowed cluster size * on a volume is 4kib. * * The compression method is based on independently compressing blocks of X * clusters, where X is determined from the compression_unit value found in the * non-resident attribute record header (more precisely: X = 2^compression_unit * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4). * * There are three different cases of how a compression block of X clusters * can be stored: * * 1) The data in the block is all zero (a sparse block): * This is stored as a sparse block in the runlist, i.e. the runlist * entry has length = X and lcn = -1. The mapping pairs array actually * uses a delta_lcn value length of 0, i.e. delta_lcn is not present at * all, which is then interpreted by the driver as lcn = -1. * NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then * the same principles apply as above, except that the length is not * restricted to being any particular value. * * 2) The data in the block is not compressed: * This happens when compression doesn't reduce the size of the block * in clusters. I.e. if compression has a small effect so that the * compressed data still occupies X clusters, then the uncompressed data * is stored in the block. * This case is recognised by the fact that the runlist entry has * length = X and lcn >= 0. The mapping pairs array stores this as * normal with a run length of X and some specific delta_lcn, i.e. * delta_lcn has to be present. * * 3) The data in the block is compressed: * The common case. This case is recognised by the fact that the run * list entry has length L < X and lcn >= 0. The mapping pairs array * stores this as normal with a run length of X and some specific * delta_lcn, i.e. delta_lcn has to be present. This runlist entry is * immediately followed by a sparse entry with length = X - L and * lcn = -1. The latter entry is to make up the vcn counting to the * full compression block size X. * * In fact, life is more complicated because adjacent entries of the same type * can be coalesced. This means that one has to keep track of the number of * clusters handled and work on a basis of X clusters at a time being one * block. An example: if length L > X this means that this particular runlist * entry contains a block of length X and part of one or more blocks of length * L - X. Another example: if length L < X, this does not necessarily mean that * the block is compressed as it might be that the lcn changes inside the block * and hence the following runlist entry describes the continuation of the * potentially compressed block. The block would be compressed if the * following runlist entry describes at least X - L sparse clusters, thus * making up the compression block length as described in point 3 above. (Of * course, there can be several runlist entries with small lengths so that the * sparse entry does not follow the first data containing entry with * length < X.) * * NOTE: At the end of the compressed attribute value, there most likely is not * just the right amount of data to make up a compression block, thus this data * is not even attempted to be compressed. It is just stored as is, unless * the number of clusters it occupies is reduced when compressed in which case * it is stored as a compressed compression block, complete with sparse * clusters at the end. */ /* * Flags of resident attributes (8-bit). */ enum { RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index (has implications for deleting and modifying the attribute). */ } __attribute__ ((__packed__)); typedef u8 RESIDENT_ATTR_FLAGS; /* * Attribute record header. Always aligned to 8-byte boundary. */ typedef struct { /*Ofs*/ /* 0*/ ATTR_TYPE type; /* The (32-bit) type of the attribute. */ /* 4*/ le32 length; /* Byte size of the resident part of the attribute (aligned to 8-byte boundary). Used to get to the next attribute. */ /* 8*/ u8 non_resident; /* If 0, attribute is resident. If 1, attribute is non-resident. */ /* 9*/ u8 name_length; /* Unicode character size of name of attribute. 0 if unnamed. */ /* 10*/ le16 name_offset; /* If name_length != 0, the byte offset to the beginning of the name from the attribute record. Note that the name is stored as a Unicode string. When creating, place offset just at the end of the record header. Then, follow with attribute value or mapping pairs array, resident and non-resident attributes respectively, aligning to an 8-byte boundary. */ /* 12*/ ATTR_FLAGS flags; /* Flags describing the attribute. */ /* 14*/ le16 instance; /* The instance of this attribute record. This number is unique within this mft record (see MFT_RECORD/next_attribute_instance notes in in mft.h for more details). */ /* 16*/ union { /* Resident attributes. */ struct { /* 16 */ le32 value_length;/* Byte size of attribute value. */ /* 20 */ le16 value_offset;/* Byte offset of the attribute value from the start of the attribute record. When creating, align to 8-byte boundary if we have a name present as this might not have a length of a multiple of 8-bytes. */ /* 22 */ RESIDENT_ATTR_FLAGS flags; /* See above. */ /* 23 */ s8 reserved; /* Reserved/alignment to 8-byte boundary. */ } __attribute__ ((__packed__)) resident; /* Non-resident attributes. */ struct { /* 16*/ leVCN lowest_vcn;/* Lowest valid virtual cluster number for this portion of the attribute value or 0 if this is the only extent (usually the case). - Only when an attribute list is used does lowest_vcn != 0 ever occur. */ /* 24*/ leVCN highest_vcn;/* Highest valid vcn of this extent of the attribute value. - Usually there is only one portion, so this usually equals the attribute value size in clusters minus 1. Can be -1 for zero length files. Can be 0 for "single extent" attributes. */ /* 32*/ le16 mapping_pairs_offset; /* Byte offset from the beginning of the structure to the mapping pairs array which contains the mappings between the vcns and the logical cluster numbers (lcns). When creating, place this at the end of this record header aligned to 8-byte boundary. */ /* 34*/ u8 compression_unit; /* The compression unit expressed as the log to the base 2 of the number of clusters in a compression unit. 0 means not compressed. (This effectively limits the compression unit size to be a power of two clusters.) WinNT4 only uses a value of 4. Sparse files have this set to 0 on XPSP2. */ /* 35*/ u8 reserved[5]; /* Align to 8-byte boundary. */ /* The sizes below are only used when lowest_vcn is zero, as otherwise it would be difficult to keep them up-to-date.*/ /* 40*/ sle64 allocated_size; /* Byte size of disk space allocated to hold the attribute value. Always is a multiple of the cluster size. When a file is compressed, this field is a multiple of the compression block size (2^compression_unit) and it represents the logically allocated space rather than the actual on disk usage. For this use the compressed_size (see below). */ /* 48*/ sle64 data_size; /* Byte size of the attribute value. Can be larger than allocated_size if attribute value is compressed or sparse. */ /* 56*/ sle64 initialized_size; /* Byte size of initialized portion of the attribute value. Usually equals data_size. */ /* sizeof(uncompressed attr) = 64*/ /* 64*/ sle64 compressed_size; /* Byte size of the attribute value after compression. Only present when compressed or sparse. Always is a multiple of the cluster size. Represents the actual amount of disk space being used on the disk. */ /* sizeof(compressed attr) = 72*/ } __attribute__ ((__packed__)) non_resident; } __attribute__ ((__packed__)) data; } __attribute__ ((__packed__)) ATTR_RECORD; typedef ATTR_RECORD ATTR_REC; /* * File attribute flags (32-bit) appearing in the file_attributes fields of the * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR * attributes of MFT_RECORDs and directory index entries. * * All of the below flags appear in the directory index entries but only some * appear in the STANDARD_INFORMATION attribute whilst only some others appear * in the FILENAME_ATTR attribute of MFT_RECORDs. Unless otherwise stated the * flags appear in all of the above. */ enum { FILE_ATTR_READONLY = cpu_to_le32(0x00000001), FILE_ATTR_HIDDEN = cpu_to_le32(0x00000002), FILE_ATTR_SYSTEM = cpu_to_le32(0x00000004), /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */ FILE_ATTR_DIRECTORY = cpu_to_le32(0x00000010), /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT. It is reserved for the DOS SUBDIRECTORY flag. */ FILE_ATTR_ARCHIVE = cpu_to_le32(0x00000020), FILE_ATTR_DEVICE = cpu_to_le32(0x00000040), FILE_ATTR_NORMAL = cpu_to_le32(0x00000080), FILE_ATTR_TEMPORARY = cpu_to_le32(0x00000100), FILE_ATTR_SPARSE_FILE = cpu_to_le32(0x00000200), FILE_ATTR_REPARSE_POINT = cpu_to_le32(0x00000400), FILE_ATTR_COMPRESSED = cpu_to_le32(0x00000800), FILE_ATTR_OFFLINE = cpu_to_le32(0x00001000), FILE_ATTR_NOT_CONTENT_INDEXED = cpu_to_le32(0x00002000), FILE_ATTR_ENCRYPTED = cpu_to_le32(0x00004000), FILE_ATTR_VALID_FLAGS = cpu_to_le32(0x00007fb7), /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the FILE_ATTR_DEVICE and preserves everything else. This mask is used to obtain all flags that are valid for reading. */ FILE_ATTR_VALID_SET_FLAGS = cpu_to_le32(0x000031a7), /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT, F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest. This mask is used to obtain all flags that are valid for setting. */ /* * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION * attribute of an mft record. */ FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT = cpu_to_le32(0x10000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this is a directory or not, i.e. whether it has an index root attribute or not. */ FILE_ATTR_DUP_VIEW_INDEX_PRESENT = cpu_to_le32(0x20000000), /* Note, this is a copy of the corresponding bit from the mft record, telling us whether this file has a view index present (eg. object id index, quota index, one of the security indexes or the encrypting filesystem related indexes). */ }; typedef le32 FILE_ATTR_FLAGS; /* * NOTE on times in NTFS: All times are in MS standard time format, i.e. they * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00 * universal coordinated time (UTC). (In Linux time starts 1st January 1970, * 00:00:00 UTC and is stored as the number of 1-second intervals since then.) */ /* * Attribute: Standard information (0x10). * * NOTE: Always resident. * NOTE: Present in all base file records on a volume. * NOTE: There is conflicting information about the meaning of each of the time * fields but the meaning as defined below has been verified to be * correct by practical experimentation on Windows NT4 SP6a and is hence * assumed to be the one and only correct interpretation. */ typedef struct { /*Ofs*/ /* 0*/ sle64 creation_time; /* Time file was created. Updated when a filename is changed(?). */ /* 8*/ sle64 last_data_change_time; /* Time the data attribute was last modified. */ /* 16*/ sle64 last_mft_change_time; /* Time this mft record was last modified. */ /* 24*/ sle64 last_access_time; /* Approximate time when the file was last accessed (obviously this is not updated on read-only volumes). In Windows this is only updated when accessed if some time delta has passed since the last update. Also, last access time updates can be disabled altogether for speed. */ /* 32*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ /* 36*/ union { /* NTFS 1.2 */ struct { /* 36*/ u8 reserved12[12]; /* Reserved/alignment to 8-byte boundary. */ } __attribute__ ((__packed__)) v1; /* sizeof() = 48 bytes */ /* NTFS 3.x */ struct { /* * If a volume has been upgraded from a previous NTFS version, then these * fields are present only if the file has been accessed since the upgrade. * Recognize the difference by comparing the length of the resident attribute * value. If it is 48, then the following fields are missing. If it is 72 then * the fields are present. Maybe just check like this: * if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) { * Assume NTFS 1.2- format. * If (volume version is 3.x) * Upgrade attribute to NTFS 3.x format. * else * Use NTFS 1.2- format for access. * } else * Use NTFS 3.x format for access. * Only problem is that it might be legal to set the length of the value to * arbitrarily large values thus spoiling this check. - But chkdsk probably * views that as a corruption, assuming that it behaves like this for all * attributes. */ /* 36*/ le32 maximum_versions; /* Maximum allowed versions for file. Zero if version numbering is disabled. */ /* 40*/ le32 version_number; /* This file's version (if any). Set to zero if maximum_versions is zero. */ /* 44*/ le32 class_id; /* Class id from bidirectional class id index (?). */ /* 48*/ le32 owner_id; /* Owner_id of the user owning the file. Translate via $Q index in FILE_Extend /$Quota to the quota control entry for the user owning the file. Zero if quotas are disabled. */ /* 52*/ le32 security_id; /* Security_id for the file. Translate via $SII index and $SDS data stream in FILE_Secure to the security descriptor. */ /* 56*/ le64 quota_charged; /* Byte size of the charge to the quota for all streams of the file. Note: Is zero if quotas are disabled. */ /* 64*/ leUSN usn; /* Last update sequence number of the file. This is a direct index into the transaction log file ($UsnJrnl). It is zero if the usn journal is disabled or this file has not been subject to logging yet. See usnjrnl.h for details. */ } __attribute__ ((__packed__)) v3; /* sizeof() = 72 bytes (NTFS 3.x) */ } __attribute__ ((__packed__)) ver; } __attribute__ ((__packed__)) STANDARD_INFORMATION; /* * Attribute: Attribute list (0x20). * * - Can be either resident or non-resident. * - Value consists of a sequence of variable length, 8-byte aligned, * ATTR_LIST_ENTRY records. * - The list is not terminated by anything at all! The only way to know when * the end is reached is to keep track of the current offset and compare it to * the attribute value size. * - The attribute list attribute contains one entry for each attribute of * the file in which the list is located, except for the list attribute * itself. The list is sorted: first by attribute type, second by attribute * name (if present), third by instance number. The extents of one * non-resident attribute (if present) immediately follow after the initial * extent. They are ordered by lowest_vcn and have their instace set to zero. * It is not allowed to have two attributes with all sorting keys equal. * - Further restrictions: * - If not resident, the vcn to lcn mapping array has to fit inside the * base mft record. * - The attribute list attribute value has a maximum size of 256kb. This * is imposed by the Windows cache manager. * - Attribute lists are only used when the attributes of mft record do not * fit inside the mft record despite all attributes (that can be made * non-resident) having been made non-resident. This can happen e.g. when: * - File has a large number of hard links (lots of file name * attributes present). * - The mapping pairs array of some non-resident attribute becomes so * large due to fragmentation that it overflows the mft record. * - The security descriptor is very complex (not applicable to * NTFS 3.0 volumes). * - There are many named streams. */ typedef struct { /*Ofs*/ /* 0*/ ATTR_TYPE type; /* Type of referenced attribute. */ /* 4*/ le16 length; /* Byte size of this entry (8-byte aligned). */ /* 6*/ u8 name_length; /* Size in Unicode chars of the name of the attribute or 0 if unnamed. */ /* 7*/ u8 name_offset; /* Byte offset to beginning of attribute name (always set this to where the name would start even if unnamed). */ /* 8*/ leVCN lowest_vcn; /* Lowest virtual cluster number of this portion of the attribute value. This is usually 0. It is non-zero for the case where one attribute does not fit into one mft record and thus several mft records are allocated to hold this attribute. In the latter case, each mft record holds one extent of the attribute and there is one attribute list entry for each extent. NOTE: This is DEFINITELY a signed value! The windows driver uses cmp, followed by jg when comparing this, thus it treats it as signed. */ /* 16*/ leMFT_REF mft_reference;/* The reference of the mft record holding the ATTR_RECORD for this portion of the attribute value. */ /* 24*/ le16 instance; /* If lowest_vcn = 0, the instance of the attribute being referenced; otherwise 0. */ /* 26*/ ntfschar name[0]; /* Use when creating only. When reading use name_offset to determine the location of the name. */ /* sizeof() = 26 + (attribute_name_length * 2) bytes */ } __attribute__ ((__packed__)) ATTR_LIST_ENTRY; /* * The maximum allowed length for a file name. */ #define MAXIMUM_FILE_NAME_LENGTH 255 /* * Possible namespaces for filenames in ntfs (8-bit). */ enum { FILE_NAME_POSIX = 0x00, /* This is the largest namespace. It is case sensitive and allows all Unicode characters except for: '\0' and '/'. Beware that in WinNT/2k/2003 by default files which eg have the same name except for their case will not be distinguished by the standard utilities and thus a "del filename" will delete both "filename" and "fileName" without warning. However if for example Services For Unix (SFU) are installed and the case sensitive option was enabled at installation time, then you can create/access/delete such files. Note that even SFU places restrictions on the filenames beyond the '\0' and '/' and in particular the following set of characters is not allowed: '"', '/', '<', '>', '\'. All other characters, including the ones no allowed in WIN32 namespace are allowed. Tested with SFU 3.5 (this is now free) running on Windows XP. */ FILE_NAME_WIN32 = 0x01, /* The standard WinNT/2k NTFS long filenames. Case insensitive. All Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\', and '|'. Further, names cannot end with a '.' or a space. */ FILE_NAME_DOS = 0x02, /* The standard DOS filenames (8.3 format). Uppercase only. All 8-bit characters greater space, except: '"', '*', '+', ',', '/', ':', ';', '<', '=', '>', '?', and '\'. */ FILE_NAME_WIN32_AND_DOS = 0x03, /* 3 means that both the Win32 and the DOS filenames are identical and hence have been saved in this single filename record. */ } __attribute__ ((__packed__)); typedef u8 FILE_NAME_TYPE_FLAGS; /* * Attribute: Filename (0x30). * * NOTE: Always resident. * NOTE: All fields, except the parent_directory, are only updated when the * filename is changed. Until then, they just become out of sync with * reality and the more up to date values are present in the standard * information attribute. * NOTE: There is conflicting information about the meaning of each of the time * fields but the meaning as defined below has been verified to be * correct by practical experimentation on Windows NT4 SP6a and is hence * assumed to be the one and only correct interpretation. */ typedef struct { /*hex ofs*/ /* 0*/ leMFT_REF parent_directory; /* Directory this filename is referenced from. */ /* 8*/ sle64 creation_time; /* Time file was created. */ /* 10*/ sle64 last_data_change_time; /* Time the data attribute was last modified. */ /* 18*/ sle64 last_mft_change_time; /* Time this mft record was last modified. */ /* 20*/ sle64 last_access_time; /* Time this mft record was last accessed. */ /* 28*/ sle64 allocated_size; /* Byte size of on-disk allocated space for the unnamed data attribute. So for normal $DATA, this is the allocated_size from the unnamed $DATA attribute and for compressed and/or sparse $DATA, this is the compressed_size from the unnamed $DATA attribute. For a directory or other inode without an unnamed $DATA attribute, this is always 0. NOTE: This is a multiple of the cluster size. */ /* 30*/ sle64 data_size; /* Byte size of actual data in unnamed data attribute. For a directory or other inode without an unnamed $DATA attribute, this is always 0. */ /* 38*/ FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */ /* 3c*/ union { /* 3c*/ struct { /* 3c*/ le16 packed_ea_size; /* Size of the buffer needed to pack the extended attributes (EAs), if such are present.*/ /* 3e*/ le16 reserved; /* Reserved for alignment. */ } __attribute__ ((__packed__)) ea; /* 3c*/ struct { /* 3c*/ le32 reparse_point_tag; /* Type of reparse point, present only in reparse points and only if there are no EAs. */ } __attribute__ ((__packed__)) rp; } __attribute__ ((__packed__)) type; /* 40*/ u8 file_name_length; /* Length of file name in (Unicode) characters. */ /* 41*/ FILE_NAME_TYPE_FLAGS file_name_type; /* Namespace of the file name.*/ /* 42*/ ntfschar file_name[0]; /* File name in Unicode. */ } __attribute__ ((__packed__)) FILE_NAME_ATTR; /* * GUID structures store globally unique identifiers (GUID). A GUID is a * 128-bit value consisting of one group of eight hexadecimal digits, followed * by three groups of four hexadecimal digits each, followed by one group of * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the * distributed computing environment (DCE) universally unique identifier (UUID). * Example of a GUID: * 1F010768-5A73-BC91-0010A52216A7 */ typedef struct { le32 data1; /* The first eight hexadecimal digits of the GUID. */ le16 data2; /* The first group of four hexadecimal digits. */ le16 data3; /* The second group of four hexadecimal digits. */ u8 data4[8]; /* The first two bytes are the third group of four hexadecimal digits. The remaining six bytes are the final 12 hexadecimal digits. */ } __attribute__ ((__packed__)) GUID; /* * FILE_Extend/$ObjId contains an index named $O. This index contains all * object_ids present on the volume as the index keys and the corresponding * mft_record numbers as the index entry data parts. The data part (defined * below) also contains three other object_ids: * birth_volume_id - object_id of FILE_Volume on which the file was first * created. Optional (i.e. can be zero). * birth_object_id - object_id of file when it was first created. Usually * equals the object_id. Optional (i.e. can be zero). * domain_id - Reserved (always zero). */ typedef struct { leMFT_REF mft_reference;/* Mft record containing the object_id in the index entry key. */ union { struct { GUID birth_volume_id; GUID birth_object_id; GUID domain_id; } __attribute__ ((__packed__)) origin; u8 extended_info[48]; } __attribute__ ((__packed__)) opt; } __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA; /* * Attribute: Object id (NTFS 3.0+) (0x40). * * NOTE: Always resident. */ typedef struct { GUID object_id; /* Unique id assigned to the file.*/ /* The following fields are optional. The attribute value size is 16 bytes, i.e. sizeof(GUID), if these are not present at all. Note, the entries can be present but one or more (or all) can be zero meaning that that particular value(s) is(are) not defined. */ union { struct { GUID birth_volume_id; /* Unique id of volume on which the file was first created.*/ GUID birth_object_id; /* Unique id of file when it was first created. */ GUID domain_id; /* Reserved, zero. */ } __attribute__ ((__packed__)) origin; u8 extended_info[48]; } __attribute__ ((__packed__)) opt; } __attribute__ ((__packed__)) OBJECT_ID_ATTR; /* * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in * the SID structure (see below). */ //typedef enum { /* SID string prefix. */ // SECURITY_NULL_SID_AUTHORITY = {0, 0, 0, 0, 0, 0}, /* S-1-0 */ // SECURITY_WORLD_SID_AUTHORITY = {0, 0, 0, 0, 0, 1}, /* S-1-1 */ // SECURITY_LOCAL_SID_AUTHORITY = {0, 0, 0, 0, 0, 2}, /* S-1-2 */ // SECURITY_CREATOR_SID_AUTHORITY = {0, 0, 0, 0, 0, 3}, /* S-1-3 */ // SECURITY_NON_UNIQUE_AUTHORITY = {0, 0, 0, 0, 0, 4}, /* S-1-4 */ // SECURITY_NT_SID_AUTHORITY = {0, 0, 0, 0, 0, 5}, /* S-1-5 */ //} IDENTIFIER_AUTHORITIES; /* * These relative identifiers (RIDs) are used with the above identifier * authorities to make up universal well-known SIDs. * * Note: The relative identifier (RID) refers to the portion of a SID, which * identifies a user or group in relation to the authority that issued the SID. * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and * the relative identifier SECURITY_CREATOR_OWNER_RID (0). */ typedef enum { /* Identifier authority. */ SECURITY_NULL_RID = 0, /* S-1-0 */ SECURITY_WORLD_RID = 0, /* S-1-1 */ SECURITY_LOCAL_RID = 0, /* S-1-2 */ SECURITY_CREATOR_OWNER_RID = 0, /* S-1-3 */ SECURITY_CREATOR_GROUP_RID = 1, /* S-1-3 */ SECURITY_CREATOR_OWNER_SERVER_RID = 2, /* S-1-3 */ SECURITY_CREATOR_GROUP_SERVER_RID = 3, /* S-1-3 */ SECURITY_DIALUP_RID = 1, SECURITY_NETWORK_RID = 2, SECURITY_BATCH_RID = 3, SECURITY_INTERACTIVE_RID = 4, SECURITY_SERVICE_RID = 6, SECURITY_ANONYMOUS_LOGON_RID = 7, SECURITY_PROXY_RID = 8, SECURITY_ENTERPRISE_CONTROLLERS_RID=9, SECURITY_SERVER_LOGON_RID = 9, SECURITY_PRINCIPAL_SELF_RID = 0xa, SECURITY_AUTHENTICATED_USER_RID = 0xb, SECURITY_RESTRICTED_CODE_RID = 0xc, SECURITY_TERMINAL_SERVER_RID = 0xd, SECURITY_LOGON_IDS_RID = 5, SECURITY_LOGON_IDS_RID_COUNT = 3, SECURITY_LOCAL_SYSTEM_RID = 0x12, SECURITY_NT_NON_UNIQUE = 0x15, SECURITY_BUILTIN_DOMAIN_RID = 0x20, /* * Well-known domain relative sub-authority values (RIDs). */ /* Users. */ DOMAIN_USER_RID_ADMIN = 0x1f4, DOMAIN_USER_RID_GUEST = 0x1f5, DOMAIN_USER_RID_KRBTGT = 0x1f6, /* Groups. */ DOMAIN_GROUP_RID_ADMINS = 0x200, DOMAIN_GROUP_RID_USERS = 0x201, DOMAIN_GROUP_RID_GUESTS = 0x202, DOMAIN_GROUP_RID_COMPUTERS = 0x203, DOMAIN_GROUP_RID_CONTROLLERS = 0x204, DOMAIN_GROUP_RID_CERT_ADMINS = 0x205, DOMAIN_GROUP_RID_SCHEMA_ADMINS = 0x206, DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207, DOMAIN_GROUP_RID_POLICY_ADMINS = 0x208, /* Aliases. */ DOMAIN_ALIAS_RID_ADMINS = 0x220, DOMAIN_ALIAS_RID_USERS = 0x221, DOMAIN_ALIAS_RID_GUESTS = 0x222, DOMAIN_ALIAS_RID_POWER_USERS = 0x223, DOMAIN_ALIAS_RID_ACCOUNT_OPS = 0x224, DOMAIN_ALIAS_RID_SYSTEM_OPS = 0x225, DOMAIN_ALIAS_RID_PRINT_OPS = 0x226, DOMAIN_ALIAS_RID_BACKUP_OPS = 0x227, DOMAIN_ALIAS_RID_REPLICATOR = 0x228, DOMAIN_ALIAS_RID_RAS_SERVERS = 0x229, DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a, } RELATIVE_IDENTIFIERS; /* * The universal well-known SIDs: * * NULL_SID S-1-0-0 * WORLD_SID S-1-1-0 * LOCAL_SID S-1-2-0 * CREATOR_OWNER_SID S-1-3-0 * CREATOR_GROUP_SID S-1-3-1 * CREATOR_OWNER_SERVER_SID S-1-3-2 * CREATOR_GROUP_SERVER_SID S-1-3-3 * * (Non-unique IDs) S-1-4 * * NT well-known SIDs: * * NT_AUTHORITY_SID S-1-5 * DIALUP_SID S-1-5-1 * * NETWORD_SID S-1-5-2 * BATCH_SID S-1-5-3 * INTERACTIVE_SID S-1-5-4 * SERVICE_SID S-1-5-6 * ANONYMOUS_LOGON_SID S-1-5-7 (aka null logon session) * PROXY_SID S-1-5-8 * SERVER_LOGON_SID S-1-5-9 (aka domain controller account) * SELF_SID S-1-5-10 (self RID) * AUTHENTICATED_USER_SID S-1-5-11 * RESTRICTED_CODE_SID S-1-5-12 (running restricted code) * TERMINAL_SERVER_SID S-1-5-13 (running on terminal server) * * (Logon IDs) S-1-5-5-X-Y * * (NT non-unique IDs) S-1-5-0x15-... * * (Built-in domain) S-1-5-0x20 */ /* * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure. * * NOTE: This is stored as a big endian number, hence the high_part comes * before the low_part. */ typedef union { struct { u16 high_part; /* High 16-bits. */ u32 low_part; /* Low 32-bits. */ } __attribute__ ((__packed__)) parts; u8 value[6]; /* Value as individual bytes. */ } __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY; /* * The SID structure is a variable-length structure used to uniquely identify * users or groups. SID stands for security identifier. * * The standard textual representation of the SID is of the form: * S-R-I-S-S... * Where: * - The first "S" is the literal character 'S' identifying the following * digits as a SID. * - R is the revision level of the SID expressed as a sequence of digits * either in decimal or hexadecimal (if the later, prefixed by "0x"). * - I is the 48-bit identifier_authority, expressed as digits as R above. * - S... is one or more sub_authority values, expressed as digits as above. * * Example SID; the domain-relative SID of the local Administrators group on * Windows NT/2k: * S-1-5-32-544 * This translates to a SID with: * revision = 1, * sub_authority_count = 2, * identifier_authority = {0,0,0,0,0,5}, // SECURITY_NT_AUTHORITY * sub_authority[0] = 32, // SECURITY_BUILTIN_DOMAIN_RID * sub_authority[1] = 544 // DOMAIN_ALIAS_RID_ADMINS */ typedef struct { u8 revision; u8 sub_authority_count; SID_IDENTIFIER_AUTHORITY identifier_authority; le32 sub_authority[1]; /* At least one sub_authority. */ } __attribute__ ((__packed__)) SID; /* * Current constants for SIDs. */ typedef enum { SID_REVISION = 1, /* Current revision level. */ SID_MAX_SUB_AUTHORITIES = 15, /* Maximum number of those. */ SID_RECOMMENDED_SUB_AUTHORITIES = 1, /* Will change to around 6 in a future revision. */ } SID_CONSTANTS; /* * The predefined ACE types (8-bit, see below). */ enum { ACCESS_MIN_MS_ACE_TYPE = 0, ACCESS_ALLOWED_ACE_TYPE = 0, ACCESS_DENIED_ACE_TYPE = 1, SYSTEM_AUDIT_ACE_TYPE = 2, SYSTEM_ALARM_ACE_TYPE = 3, /* Not implemented as of Win2k. */ ACCESS_MAX_MS_V2_ACE_TYPE = 3, ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4, ACCESS_MAX_MS_V3_ACE_TYPE = 4, /* The following are Win2k only. */ ACCESS_MIN_MS_OBJECT_ACE_TYPE = 5, ACCESS_ALLOWED_OBJECT_ACE_TYPE = 5, ACCESS_DENIED_OBJECT_ACE_TYPE = 6, SYSTEM_AUDIT_OBJECT_ACE_TYPE = 7, SYSTEM_ALARM_OBJECT_ACE_TYPE = 8, ACCESS_MAX_MS_OBJECT_ACE_TYPE = 8, ACCESS_MAX_MS_V4_ACE_TYPE = 8, /* This one is for WinNT/2k. */ ACCESS_MAX_MS_ACE_TYPE = 8, } __attribute__ ((__packed__)); typedef u8 ACE_TYPES; /* * The ACE flags (8-bit) for audit and inheritance (see below). * * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE * types to indicate that a message is generated (in Windows!) for successful * accesses. * * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types * to indicate that a message is generated (in Windows!) for failed accesses. */ enum { /* The inheritance flags. */ OBJECT_INHERIT_ACE = 0x01, CONTAINER_INHERIT_ACE = 0x02, NO_PROPAGATE_INHERIT_ACE = 0x04, INHERIT_ONLY_ACE = 0x08, INHERITED_ACE = 0x10, /* Win2k only. */ VALID_INHERIT_FLAGS = 0x1f, /* The audit flags. */ SUCCESSFUL_ACCESS_ACE_FLAG = 0x40, FAILED_ACCESS_ACE_FLAG = 0x80, } __attribute__ ((__packed__)); typedef u8 ACE_FLAGS; /* * An ACE is an access-control entry in an access-control list (ACL). * An ACE defines access to an object for a specific user or group or defines * the types of access that generate system-administration messages or alarms * for a specific user or group. The user or group is identified by a security * identifier (SID). * * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary), * which specifies the type and size of the ACE. The format of the subsequent * data depends on the ACE type. */ typedef struct { /*Ofs*/ /* 0*/ ACE_TYPES type; /* Type of the ACE. */ /* 1*/ ACE_FLAGS flags; /* Flags describing the ACE. */ /* 2*/ le16 size; /* Size in bytes of the ACE. */ } __attribute__ ((__packed__)) ACE_HEADER; /* * The access mask (32-bit). Defines the access rights. * * The specific rights (bits 0 to 15). These depend on the type of the object * being secured by the ACE. */ enum { /* Specific rights for files and directories are as follows: */ /* Right to read data from the file. (FILE) */ FILE_READ_DATA = cpu_to_le32(0x00000001), /* Right to list contents of a directory. (DIRECTORY) */ FILE_LIST_DIRECTORY = cpu_to_le32(0x00000001), /* Right to write data to the file. (FILE) */ FILE_WRITE_DATA = cpu_to_le32(0x00000002), /* Right to create a file in the directory. (DIRECTORY) */ FILE_ADD_FILE = cpu_to_le32(0x00000002), /* Right to append data to the file. (FILE) */ FILE_APPEND_DATA = cpu_to_le32(0x00000004), /* Right to create a subdirectory. (DIRECTORY) */ FILE_ADD_SUBDIRECTORY = cpu_to_le32(0x00000004), /* Right to read extended attributes. (FILE/DIRECTORY) */ FILE_READ_EA = cpu_to_le32(0x00000008), /* Right to write extended attributes. (FILE/DIRECTORY) */ FILE_WRITE_EA = cpu_to_le32(0x00000010), /* Right to execute a file. (FILE) */ FILE_EXECUTE = cpu_to_le32(0x00000020), /* Right to traverse the directory. (DIRECTORY) */ FILE_TRAVERSE = cpu_to_le32(0x00000020), /* * Right to delete a directory and all the files it contains (its * children), even if the files are read-only. (DIRECTORY) */ FILE_DELETE_CHILD = cpu_to_le32(0x00000040), /* Right to read file attributes. (FILE/DIRECTORY) */ FILE_READ_ATTRIBUTES = cpu_to_le32(0x00000080), /* Right to change file attributes. (FILE/DIRECTORY) */ FILE_WRITE_ATTRIBUTES = cpu_to_le32(0x00000100), /* * The standard rights (bits 16 to 23). These are independent of the * type of object being secured. */ /* Right to delete the object. */ DELETE = cpu_to_le32(0x00010000), /* * Right to read the information in the object's security descriptor, * not including the information in the SACL, i.e. right to read the * security descriptor and owner. */ READ_CONTROL = cpu_to_le32(0x00020000), /* Right to modify the DACL in the object's security descriptor. */ WRITE_DAC = cpu_to_le32(0x00040000), /* Right to change the owner in the object's security descriptor. */ WRITE_OWNER = cpu_to_le32(0x00080000), /* * Right to use the object for synchronization. Enables a process to * wait until the object is in the signalled state. Some object types * do not support this access right. */ SYNCHRONIZE = cpu_to_le32(0x00100000), /* * The following STANDARD_RIGHTS_* are combinations of the above for * convenience and are defined by the Win32 API. */ /* These are currently defined to READ_CONTROL. */ STANDARD_RIGHTS_READ = cpu_to_le32(0x00020000), STANDARD_RIGHTS_WRITE = cpu_to_le32(0x00020000), STANDARD_RIGHTS_EXECUTE = cpu_to_le32(0x00020000), /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */ STANDARD_RIGHTS_REQUIRED = cpu_to_le32(0x000f0000), /* * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and * SYNCHRONIZE access. */ STANDARD_RIGHTS_ALL = cpu_to_le32(0x001f0000), /* * The access system ACL and maximum allowed access types (bits 24 to * 25, bits 26 to 27 are reserved). */ ACCESS_SYSTEM_SECURITY = cpu_to_le32(0x01000000), MAXIMUM_ALLOWED = cpu_to_le32(0x02000000), /* * The generic rights (bits 28 to 31). These map onto the standard and * specific rights. */ /* Read, write, and execute access. */ GENERIC_ALL = cpu_to_le32(0x10000000), /* Execute access. */ GENERIC_EXECUTE = cpu_to_le32(0x20000000), /* * Write access. For files, this maps onto: * FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA | * FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE * For directories, the mapping has the same numerical value. See * above for the descriptions of the rights granted. */ GENERIC_WRITE = cpu_to_le32(0x40000000), /* * Read access. For files, this maps onto: * FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA | * STANDARD_RIGHTS_READ | SYNCHRONIZE * For directories, the mapping has the same numberical value. See * above for the descriptions of the rights granted. */ GENERIC_READ = cpu_to_le32(0x80000000), }; typedef le32 ACCESS_MASK; /* * The generic mapping array. Used to denote the mapping of each generic * access right to a specific access mask. * * FIXME: What exactly is this and what is it for? (AIA) */ typedef struct { ACCESS_MASK generic_read; ACCESS_MASK generic_write; ACCESS_MASK generic_execute; ACCESS_MASK generic_all; } __attribute__ ((__packed__)) GENERIC_MAPPING; /* * The predefined ACE type structures are as defined below. */ /* * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE */ typedef struct { /* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ ACE_TYPES type; /* Type of the ACE. */ ACE_FLAGS flags; /* Flags describing the ACE. */ le16 size; /* Size in bytes of the ACE. */ /* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ /* 8*/ SID sid; /* The SID associated with the ACE. */ } __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE; /* * The object ACE flags (32-bit). */ enum { ACE_OBJECT_TYPE_PRESENT = cpu_to_le32(1), ACE_INHERITED_OBJECT_TYPE_PRESENT = cpu_to_le32(2), }; typedef le32 OBJECT_ACE_FLAGS; typedef struct { /* 0 ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ ACE_TYPES type; /* Type of the ACE. */ ACE_FLAGS flags; /* Flags describing the ACE. */ le16 size; /* Size in bytes of the ACE. */ /* 4*/ ACCESS_MASK mask; /* Access mask associated with the ACE. */ /* 8*/ OBJECT_ACE_FLAGS object_flags; /* Flags describing the object ACE. */ /* 12*/ GUID object_type; /* 28*/ GUID inherited_object_type; /* 44*/ SID sid; /* The SID associated with the ACE. */ } __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE, ACCESS_DENIED_OBJECT_ACE, SYSTEM_AUDIT_OBJECT_ACE, SYSTEM_ALARM_OBJECT_ACE; /* * An ACL is an access-control list (ACL). * An ACL starts with an ACL header structure, which specifies the size of * the ACL and the number of ACEs it contains. The ACL header is followed by * zero or more access control entries (ACEs). The ACL as well as each ACE * are aligned on 4-byte boundaries. */ typedef struct { u8 revision; /* Revision of this ACL. */ u8 alignment1; le16 size; /* Allocated space in bytes for ACL. Includes this header, the ACEs and the remaining free space. */ le16 ace_count; /* Number of ACEs in the ACL. */ le16 alignment2; /* sizeof() = 8 bytes */ } __attribute__ ((__packed__)) ACL; /* * Current constants for ACLs. */ typedef enum { /* Current revision. */ ACL_REVISION = 2, ACL_REVISION_DS = 4, /* History of revisions. */ ACL_REVISION1 = 1, MIN_ACL_REVISION = 2, ACL_REVISION2 = 2, ACL_REVISION3 = 3, ACL_REVISION4 = 4, MAX_ACL_REVISION = 4, } ACL_CONSTANTS; /* * The security descriptor control flags (16-bit). * * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID * pointed to by the Owner field was provided by a defaulting mechanism * rather than explicitly provided by the original provider of the * security descriptor. This may affect the treatment of the SID with * respect to inheritance of an owner. * * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in * the Group field was provided by a defaulting mechanism rather than * explicitly provided by the original provider of the security * descriptor. This may affect the treatment of the SID with respect to * inheritance of a primary group. * * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security * descriptor contains a discretionary ACL. If this flag is set and the * Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is * explicitly being specified. * * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL * pointed to by the Dacl field was provided by a defaulting mechanism * rather than explicitly provided by the original provider of the * security descriptor. This may affect the treatment of the ACL with * respect to inheritance of an ACL. This flag is ignored if the * DaclPresent flag is not set. * * SE_SACL_PRESENT - This boolean flag, when set, indicates that the security * descriptor contains a system ACL pointed to by the Sacl field. If this * flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then * an empty (but present) ACL is being specified. * * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL * pointed to by the Sacl field was provided by a defaulting mechanism * rather than explicitly provided by the original provider of the * security descriptor. This may affect the treatment of the ACL with * respect to inheritance of an ACL. This flag is ignored if the * SaclPresent flag is not set. * * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security * descriptor is in self-relative form. In this form, all fields of the * security descriptor are contiguous in memory and all pointer fields are * expressed as offsets from the beginning of the security descriptor. */ enum { SE_OWNER_DEFAULTED = cpu_to_le16(0x0001), SE_GROUP_DEFAULTED = cpu_to_le16(0x0002), SE_DACL_PRESENT = cpu_to_le16(0x0004), SE_DACL_DEFAULTED = cpu_to_le16(0x0008), SE_SACL_PRESENT = cpu_to_le16(0x0010), SE_SACL_DEFAULTED = cpu_to_le16(0x0020), SE_DACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0100), SE_SACL_AUTO_INHERIT_REQ = cpu_to_le16(0x0200), SE_DACL_AUTO_INHERITED = cpu_to_le16(0x0400), SE_SACL_AUTO_INHERITED = cpu_to_le16(0x0800), SE_DACL_PROTECTED = cpu_to_le16(0x1000), SE_SACL_PROTECTED = cpu_to_le16(0x2000), SE_RM_CONTROL_VALID = cpu_to_le16(0x4000), SE_SELF_RELATIVE = cpu_to_le16(0x8000) } __attribute__ ((__packed__)); typedef le16 SECURITY_DESCRIPTOR_CONTROL; /* * Self-relative security descriptor. Contains the owner and group SIDs as well * as the sacl and dacl ACLs inside the security descriptor itself. */ typedef struct { u8 revision; /* Revision level of the security descriptor. */ u8 alignment; SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of the descriptor as well as the following fields. */ le32 owner; /* Byte offset to a SID representing an object's owner. If this is NULL, no owner SID is present in the descriptor. */ le32 group; /* Byte offset to a SID representing an object's primary group. If this is NULL, no primary group SID is present in the descriptor. */ le32 sacl; /* Byte offset to a system ACL. Only valid, if SE_SACL_PRESENT is set in the control field. If SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL is specified. */ le32 dacl; /* Byte offset to a discretionary ACL. Only valid, if SE_DACL_PRESENT is set in the control field. If SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL (unconditionally granting access) is specified. */ /* sizeof() = 0x14 bytes */ } __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE; /* * Absolute security descriptor. Does not contain the owner and group SIDs, nor * the sacl and dacl ACLs inside the security descriptor. Instead, it contains * pointers to these structures in memory. Obviously, absolute security * descriptors are only useful for in memory representations of security * descriptors. On disk, a self-relative security descriptor is used. */ typedef struct { u8 revision; /* Revision level of the security descriptor. */ u8 alignment; SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of the descriptor as well as the following fields. */ SID *owner; /* Points to a SID representing an object's owner. If this is NULL, no owner SID is present in the descriptor. */ SID *group; /* Points to a SID representing an object's primary group. If this is NULL, no primary group SID is present in the descriptor. */ ACL *sacl; /* Points to a system ACL. Only valid, if SE_SACL_PRESENT is set in the control field. If SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL is specified. */ ACL *dacl; /* Points to a discretionary ACL. Only valid, if SE_DACL_PRESENT is set in the control field. If SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL (unconditionally granting access) is specified. */ } __attribute__ ((__packed__)) SECURITY_DESCRIPTOR; /* * Current constants for security descriptors. */ typedef enum { /* Current revision. */ SECURITY_DESCRIPTOR_REVISION = 1, SECURITY_DESCRIPTOR_REVISION1 = 1, /* The sizes of both the absolute and relative security descriptors is the same as pointers, at least on ia32 architecture are 32-bit. */ SECURITY_DESCRIPTOR_MIN_LENGTH = sizeof(SECURITY_DESCRIPTOR), } SECURITY_DESCRIPTOR_CONSTANTS; /* * Attribute: Security descriptor (0x50). A standard self-relative security * descriptor. * * NOTE: Can be resident or non-resident. * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally * in FILE_Secure and the correct descriptor is found using the security_id * from the standard information attribute. */ typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR; /* * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one * referenced instance of each unique security descriptor is stored. * * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It * does, however, contain two indexes ($SDH and $SII) as well as a named data * stream ($SDS). * * Every unique security descriptor is assigned a unique security identifier * (security_id, not to be confused with a SID). The security_id is unique for * the NTFS volume and is used as an index into the $SII index, which maps * security_ids to the security descriptor's storage location within the $SDS * data attribute. The $SII index is sorted by ascending security_id. * * A simple hash is computed from each security descriptor. This hash is used * as an index into the $SDH index, which maps security descriptor hashes to * the security descriptor's storage location within the $SDS data attribute. * The $SDH index is sorted by security descriptor hash and is stored in a B+ * tree. When searching $SDH (with the intent of determining whether or not a * new security descriptor is already present in the $SDS data stream), if a * matching hash is found, but the security descriptors do not match, the * search in the $SDH index is continued, searching for a next matching hash. * * When a precise match is found, the security_id coresponding to the security * descriptor in the $SDS attribute is read from the found $SDH index entry and * is stored in the $STANDARD_INFORMATION attribute of the file/directory to * which the security descriptor is being applied. The $STANDARD_INFORMATION * attribute is present in all base mft records (i.e. in all files and * directories). * * If a match is not found, the security descriptor is assigned a new unique * security_id and is added to the $SDS data attribute. Then, entries * referencing the this security descriptor in the $SDS data attribute are * added to the $SDH and $SII indexes. * * Note: Entries are never deleted from FILE_Secure, even if nothing * references an entry any more. */ /* * This header precedes each security descriptor in the $SDS data stream. * This is also the index entry data part of both the $SII and $SDH indexes. */ typedef struct { le32 hash; /* Hash of the security descriptor. */ le32 security_id; /* The security_id assigned to the descriptor. */ le64 offset; /* Byte offset of this entry in the $SDS stream. */ le32 length; /* Size in bytes of this entry in $SDS stream. */ } __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER; /* * The $SDS data stream contains the security descriptors, aligned on 16-byte * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot * cross 256kib boundaries (this restriction is imposed by the Windows cache * manager). Each security descriptor is contained in a SDS_ENTRY structure. * Also, each security descriptor is stored twice in the $SDS stream with a * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the * the first copy of the security descriptor will be at offset 0x51d0 in the * $SDS data stream and the second copy will be at offset 0x451d0. */ typedef struct { /*Ofs*/ /* 0 SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */ le32 hash; /* Hash of the security descriptor. */ le32 security_id; /* The security_id assigned to the descriptor. */ le64 offset; /* Byte offset of this entry in the $SDS stream. */ le32 length; /* Size in bytes of this entry in $SDS stream. */ /* 20*/ SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security descriptor. */ } __attribute__ ((__packed__)) SDS_ENTRY; /* * The index entry key used in the $SII index. The collation type is * COLLATION_NTOFS_ULONG. */ typedef struct { le32 security_id; /* The security_id assigned to the descriptor. */ } __attribute__ ((__packed__)) SII_INDEX_KEY; /* * The index entry key used in the $SDH index. The keys are sorted first by * hash and then by security_id. The collation rule is * COLLATION_NTOFS_SECURITY_HASH. */ typedef struct { le32 hash; /* Hash of the security descriptor. */ le32 security_id; /* The security_id assigned to the descriptor. */ } __attribute__ ((__packed__)) SDH_INDEX_KEY; /* * Attribute: Volume name (0x60). * * NOTE: Always resident. * NOTE: Present only in FILE_Volume. */ typedef struct { ntfschar name[0]; /* The name of the volume in Unicode. */ } __attribute__ ((__packed__)) VOLUME_NAME; /* * Possible flags for the volume (16-bit). */ enum { VOLUME_IS_DIRTY = cpu_to_le16(0x0001), VOLUME_RESIZE_LOG_FILE = cpu_to_le16(0x0002), VOLUME_UPGRADE_ON_MOUNT = cpu_to_le16(0x0004), VOLUME_MOUNTED_ON_NT4 = cpu_to_le16(0x0008), VOLUME_DELETE_USN_UNDERWAY = cpu_to_le16(0x0010), VOLUME_REPAIR_OBJECT_ID = cpu_to_le16(0x0020), VOLUME_CHKDSK_UNDERWAY = cpu_to_le16(0x4000), VOLUME_MODIFIED_BY_CHKDSK = cpu_to_le16(0x8000), VOLUME_FLAGS_MASK = cpu_to_le16(0xc03f), /* To make our life easier when checking if we must mount read-only. */ VOLUME_MUST_MOUNT_RO_MASK = cpu_to_le16(0xc027), } __attribute__ ((__packed__)); typedef le16 VOLUME_FLAGS; /* * Attribute: Volume information (0x70). * * NOTE: Always resident. * NOTE: Present only in FILE_Volume. * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses * NTFS 1.2. I haven't personally seen other values yet. */ typedef struct { le64 reserved; /* Not used (yet?). */ u8 major_ver; /* Major version of the ntfs format. */ u8 minor_ver; /* Minor version of the ntfs format. */ VOLUME_FLAGS flags; /* Bit array of VOLUME_* flags. */ } __attribute__ ((__packed__)) VOLUME_INFORMATION; /* * Attribute: Data attribute (0x80). * * NOTE: Can be resident or non-resident. * * Data contents of a file (i.e. the unnamed stream) or of a named stream. */ typedef struct { u8 data[0]; /* The file's data contents. */ } __attribute__ ((__packed__)) DATA_ATTR; /* * Index header flags (8-bit). */ enum { /* * When index header is in an index root attribute: */ SMALL_INDEX = 0, /* The index is small enough to fit inside the index root attribute and there is no index allocation attribute present. */ LARGE_INDEX = 1, /* The index is too large to fit in the index root attribute and/or an index allocation attribute is present. */ /* * When index header is in an index block, i.e. is part of index * allocation attribute: */ LEAF_NODE = 0, /* This is a leaf node, i.e. there are no more nodes branching off it. */ INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf node. */ NODE_MASK = 1, /* Mask for accessing the *_NODE bits. */ } __attribute__ ((__packed__)); typedef u8 INDEX_HEADER_FLAGS; /* * This is the header for indexes, describing the INDEX_ENTRY records, which * follow the INDEX_HEADER. Together the index header and the index entries * make up a complete index. * * IMPORTANT NOTE: The offset, length and size structure members are counted * relative to the start of the index header structure and not relative to the * start of the index root or index allocation structures themselves. */ typedef struct { le32 entries_offset; /* Byte offset to first INDEX_ENTRY aligned to 8-byte boundary. */ le32 index_length; /* Data size of the index in bytes, i.e. bytes used from allocated size, aligned to 8-byte boundary. */ le32 allocated_size; /* Byte size of this index (block), multiple of 8 bytes. */ /* NOTE: For the index root attribute, the above two numbers are always equal, as the attribute is resident and it is resized as needed. In the case of the index allocation attribute the attribute is not resident and hence the allocated_size is a fixed value and must equal the index_block_size specified by the INDEX_ROOT attribute corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK belongs to. */ INDEX_HEADER_FLAGS flags; /* Bit field of INDEX_HEADER_FLAGS. */ u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ } __attribute__ ((__packed__)) INDEX_HEADER; /* * Attribute: Index root (0x90). * * NOTE: Always resident. * * This is followed by a sequence of index entries (INDEX_ENTRY structures) * as described by the index header. * * When a directory is small enough to fit inside the index root then this * is the only attribute describing the directory. When the directory is too * large to fit in the index root, on the other hand, two additional attributes * are present: an index allocation attribute, containing sub-nodes of the B+ * directory tree (see below), and a bitmap attribute, describing which virtual * cluster numbers (vcns) in the index allocation attribute are in use by an * index block. * * NOTE: The root directory (FILE_root) contains an entry for itself. Other * directories do not contain entries for themselves, though. */ typedef struct { ATTR_TYPE type; /* Type of the indexed attribute. Is $FILE_NAME for directories, zero for view indexes. No other values allowed. */ COLLATION_RULE collation_rule; /* Collation rule used to sort the index entries. If type is $FILE_NAME, this must be COLLATION_FILE_NAME. */ le32 index_block_size; /* Size of each index block in bytes (in the index allocation attribute). */ u8 clusters_per_index_block; /* Cluster size of each index block (in the index allocation attribute), when an index block is >= than a cluster, otherwise this will be the log of the size (like how the encoding of the mft record size and the index record size found in the boot sector work). Has to be a power of 2. */ u8 reserved[3]; /* Reserved/align to 8-byte boundary. */ INDEX_HEADER index; /* Index header describing the following index entries. */ } __attribute__ ((__packed__)) INDEX_ROOT; /* * Attribute: Index allocation (0xa0). * * NOTE: Always non-resident (doesn't make sense to be resident anyway!). * * This is an array of index blocks. Each index block starts with an * INDEX_BLOCK structure containing an index header, followed by a sequence of * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER. */ typedef struct { /* 0 NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */ NTFS_RECORD_TYPE magic; /* Magic is "INDX". */ le16 usa_ofs; /* See NTFS_RECORD definition. */ le16 usa_count; /* See NTFS_RECORD definition. */ /* 8*/ sle64 lsn; /* $LogFile sequence number of the last modification of this index block. */ /* 16*/ leVCN index_block_vcn; /* Virtual cluster number of the index block. If the cluster_size on the volume is <= the index_block_size of the directory, index_block_vcn counts in units of clusters, and in units of sectors otherwise. */ /* 24*/ INDEX_HEADER index; /* Describes the following index entries. */ /* sizeof()= 40 (0x28) bytes */ /* * When creating the index block, we place the update sequence array at this * offset, i.e. before we start with the index entries. This also makes sense, * otherwise we could run into problems with the update sequence array * containing in itself the last two bytes of a sector which would mean that * multi sector transfer protection wouldn't work. As you can't protect data * by overwriting it since you then can't get it back... * When reading use the data from the ntfs record header. */ } __attribute__ ((__packed__)) INDEX_BLOCK; typedef INDEX_BLOCK INDEX_ALLOCATION; /* * The system file FILE_Extend/$Reparse contains an index named $R listing * all reparse points on the volume. The index entry keys are as defined * below. Note, that there is no index data associated with the index entries. * * The index entries are sorted by the index key file_id. The collation rule is * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the * primary key / is not a key at all. (AIA) */ typedef struct { le32 reparse_tag; /* Reparse point type (inc. flags). */ leMFT_REF file_id; /* Mft record of the file containing the reparse point attribute. */ } __attribute__ ((__packed__)) REPARSE_INDEX_KEY; /* * Quota flags (32-bit). * * The user quota flags. Names explain meaning. */ enum { QUOTA_FLAG_DEFAULT_LIMITS = cpu_to_le32(0x00000001), QUOTA_FLAG_LIMIT_REACHED = cpu_to_le32(0x00000002), QUOTA_FLAG_ID_DELETED = cpu_to_le32(0x00000004), QUOTA_FLAG_USER_MASK = cpu_to_le32(0x00000007), /* This is a bit mask for the user quota flags. */ /* * These flags are only present in the quota defaults index entry, i.e. * in the entry where owner_id = QUOTA_DEFAULTS_ID. */ QUOTA_FLAG_TRACKING_ENABLED = cpu_to_le32(0x00000010), QUOTA_FLAG_ENFORCEMENT_ENABLED = cpu_to_le32(0x00000020), QUOTA_FLAG_TRACKING_REQUESTED = cpu_to_le32(0x00000040), QUOTA_FLAG_LOG_THRESHOLD = cpu_to_le32(0x00000080), QUOTA_FLAG_LOG_LIMIT = cpu_to_le32(0x00000100), QUOTA_FLAG_OUT_OF_DATE = cpu_to_le32(0x00000200), QUOTA_FLAG_CORRUPT = cpu_to_le32(0x00000400), QUOTA_FLAG_PENDING_DELETES = cpu_to_le32(0x00000800), }; typedef le32 QUOTA_FLAGS; /* * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas * are on a per volume and per user basis. * * The $Q index contains one entry for each existing user_id on the volume. The * index key is the user_id of the user/group owning this quota control entry, * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the * owner_id, is found in the standard information attribute. The collation rule * for $Q is COLLATION_NTOFS_ULONG. * * The $O index contains one entry for each user/group who has been assigned * a quota on that volume. The index key holds the SID of the user_id the * entry belongs to, i.e. the owner_id. The collation rule for $O is * COLLATION_NTOFS_SID. * * The $O index entry data is the user_id of the user corresponding to the SID. * This user_id is used as an index into $Q to find the quota control entry * associated with the SID. * * The $Q index entry data is the quota control entry and is defined below. */ typedef struct { le32 version; /* Currently equals 2. */ QUOTA_FLAGS flags; /* Flags describing this quota entry. */ le64 bytes_used; /* How many bytes of the quota are in use. */ sle64 change_time; /* Last time this quota entry was changed. */ sle64 threshold; /* Soft quota (-1 if not limited). */ sle64 limit; /* Hard quota (-1 if not limited). */ sle64 exceeded_time; /* How long the soft quota has been exceeded. */ SID sid; /* The SID of the user/object associated with this quota entry. Equals zero for the quota defaults entry (and in fact on a WinXP volume, it is not present at all). */ } __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY; /* * Predefined owner_id values (32-bit). */ enum { QUOTA_INVALID_ID = cpu_to_le32(0x00000000), QUOTA_DEFAULTS_ID = cpu_to_le32(0x00000001), QUOTA_FIRST_USER_ID = cpu_to_le32(0x00000100), }; /* * Current constants for quota control entries. */ typedef enum { /* Current version. */ QUOTA_VERSION = 2, } QUOTA_CONTROL_ENTRY_CONSTANTS; /* * Index entry flags (16-bit). */ enum { INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a sub-node, i.e. a reference to an index block in form of a virtual cluster number (see below). */ INDEX_ENTRY_END = cpu_to_le16(2), /* This signifies the last entry in an index block. The index entry does not represent a file but it can point to a sub-node. */ INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16-bit. */ } __attribute__ ((__packed__)); typedef le16 INDEX_ENTRY_FLAGS; /* * This the index entry header (see below). */ typedef struct { /* 0*/ union { struct { /* Only valid when INDEX_ENTRY_END is not set. */ leMFT_REF indexed_file; /* The mft reference of the file described by this index entry. Used for directory indexes. */ } __attribute__ ((__packed__)) dir; struct { /* Used for views/indexes to find the entry's data. */ le16 data_offset; /* Data byte offset from this INDEX_ENTRY. Follows the index key. */ le16 data_length; /* Data length in bytes. */ le32 reservedV; /* Reserved (zero). */ } __attribute__ ((__packed__)) vi; } __attribute__ ((__packed__)) data; /* 8*/ le16 length; /* Byte size of this index entry, multiple of 8-bytes. */ /* 10*/ le16 key_length; /* Byte size of the key value, which is in the index entry. It follows field reserved. Not multiple of 8-bytes. */ /* 12*/ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ /* 14*/ le16 reserved; /* Reserved/align to 8-byte boundary. */ /* sizeof() = 16 bytes */ } __attribute__ ((__packed__)) INDEX_ENTRY_HEADER; /* * This is an index entry. A sequence of such entries follows each INDEX_HEADER * structure. Together they make up a complete index. The index follows either * an index root attribute or an index allocation attribute. * * NOTE: Before NTFS 3.0 only filename attributes were indexed. */ typedef struct { /*Ofs*/ /* 0 INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */ union { struct { /* Only valid when INDEX_ENTRY_END is not set. */ leMFT_REF indexed_file; /* The mft reference of the file described by this index entry. Used for directory indexes. */ } __attribute__ ((__packed__)) dir; struct { /* Used for views/indexes to find the entry's data. */ le16 data_offset; /* Data byte offset from this INDEX_ENTRY. Follows the index key. */ le16 data_length; /* Data length in bytes. */ le32 reservedV; /* Reserved (zero). */ } __attribute__ ((__packed__)) vi; } __attribute__ ((__packed__)) data; le16 length; /* Byte size of this index entry, multiple of 8-bytes. */ le16 key_length; /* Byte size of the key value, which is in the index entry. It follows field reserved. Not multiple of 8-bytes. */ INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */ le16 reserved; /* Reserved/align to 8-byte boundary. */ /* 16*/ union { /* The key of the indexed attribute. NOTE: Only present if INDEX_ENTRY_END bit in flags is not set. NOTE: On NTFS versions before 3.0 the only valid key is the FILE_NAME_ATTR. On NTFS 3.0+ the following additional index keys are defined: */ FILE_NAME_ATTR file_name;/* $I30 index in directories. */ SII_INDEX_KEY sii; /* $SII index in $Secure. */ SDH_INDEX_KEY sdh; /* $SDH index in $Secure. */ GUID object_id; /* $O index in FILE_Extend/$ObjId: The object_id of the mft record found in the data part of the index. */ REPARSE_INDEX_KEY reparse; /* $R index in FILE_Extend/$Reparse. */ SID sid; /* $O index in FILE_Extend/$Quota: SID of the owner of the user_id. */ le32 owner_id; /* $Q index in FILE_Extend/$Quota: user_id of the owner of the quota control entry in the data part of the index. */ } __attribute__ ((__packed__)) key; /* The (optional) index data is inserted here when creating. */ // leVCN vcn; /* If INDEX_ENTRY_NODE bit in flags is set, the last // eight bytes of this index entry contain the virtual // cluster number of the index block that holds the // entries immediately preceding the current entry (the // vcn references the corresponding cluster in the data // of the non-resident index allocation attribute). If // the key_length is zero, then the vcn immediately // follows the INDEX_ENTRY_HEADER. Regardless of // key_length, the address of the 8-byte boundary // aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by // (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN), // where sizeof(VCN) can be hardcoded as 8 if wanted. */ } __attribute__ ((__packed__)) INDEX_ENTRY; /* * Attribute: Bitmap (0xb0). * * Contains an array of bits (aka a bitfield). * * When used in conjunction with the index allocation attribute, each bit * corresponds to one index block within the index allocation attribute. Thus * the number of bits in the bitmap * index block size / cluster size is the * number of clusters in the index allocation attribute. */ typedef struct { u8 bitmap[0]; /* Array of bits. */ } __attribute__ ((__packed__)) BITMAP_ATTR; /* * The reparse point tag defines the type of the reparse point. It also * includes several flags, which further describe the reparse point. * * The reparse point tag is an unsigned 32-bit value divided in three parts: * * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of * the reparse point. * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use. * 3. The most significant three bits are flags describing the reparse point. * They are defined as follows: * bit 29: Name surrogate bit. If set, the filename is an alias for * another object in the system. * bit 30: High-latency bit. If set, accessing the first byte of data will * be slow. (E.g. the data is stored on a tape drive.) * bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User * defined tags have to use zero here. * * These are the predefined reparse point tags: */ enum { IO_REPARSE_TAG_IS_ALIAS = cpu_to_le32(0x20000000), IO_REPARSE_TAG_IS_HIGH_LATENCY = cpu_to_le32(0x40000000), IO_REPARSE_TAG_IS_MICROSOFT = cpu_to_le32(0x80000000), IO_REPARSE_TAG_RESERVED_ZERO = cpu_to_le32(0x00000000), IO_REPARSE_TAG_RESERVED_ONE = cpu_to_le32(0x00000001), IO_REPARSE_TAG_RESERVED_RANGE = cpu_to_le32(0x00000001), IO_REPARSE_TAG_NSS = cpu_to_le32(0x68000005), IO_REPARSE_TAG_NSS_RECOVER = cpu_to_le32(0x68000006), IO_REPARSE_TAG_SIS = cpu_to_le32(0x68000007), IO_REPARSE_TAG_DFS = cpu_to_le32(0x68000008), IO_REPARSE_TAG_MOUNT_POINT = cpu_to_le32(0x88000003), IO_REPARSE_TAG_HSM = cpu_to_le32(0xa8000004), IO_REPARSE_TAG_SYMBOLIC_LINK = cpu_to_le32(0xe8000000), IO_REPARSE_TAG_VALID_VALUES = cpu_to_le32(0xe000ffff), }; /* * Attribute: Reparse point (0xc0). * * NOTE: Can be resident or non-resident. */ typedef struct { le32 reparse_tag; /* Reparse point type (inc. flags). */ le16 reparse_data_length; /* Byte size of reparse data. */ le16 reserved; /* Align to 8-byte boundary. */ u8 reparse_data[0]; /* Meaning depends on reparse_tag. */ } __attribute__ ((__packed__)) REPARSE_POINT; /* * Attribute: Extended attribute (EA) information (0xd0). * * NOTE: Always resident. (Is this true???) */ typedef struct { le16 ea_length; /* Byte size of the packed extended attributes. */ le16 need_ea_count; /* The number of extended attributes which have the NEED_EA bit set. */ le32 ea_query_length; /* Byte size of the buffer required to query the extended attributes when calling ZwQueryEaFile() in Windows NT/2k. I.e. the byte size of the unpacked extended attributes. */ } __attribute__ ((__packed__)) EA_INFORMATION; /* * Extended attribute flags (8-bit). */ enum { NEED_EA = 0x80 /* If set the file to which the EA belongs cannot be interpreted without understanding the associates extended attributes. */ } __attribute__ ((__packed__)); typedef u8 EA_FLAGS; /* * Attribute: Extended attribute (EA) (0xe0). * * NOTE: Can be resident or non-resident. * * Like the attribute list and the index buffer list, the EA attribute value is * a sequence of EA_ATTR variable length records. */ typedef struct { le32 next_entry_offset; /* Offset to the next EA_ATTR. */ EA_FLAGS flags; /* Flags describing the EA. */ u8 ea_name_length; /* Length of the name of the EA in bytes excluding the '\0' byte terminator. */ le16 ea_value_length; /* Byte size of the EA's value. */ u8 ea_name[0]; /* Name of the EA. Note this is ASCII, not Unicode and it is zero terminated. */ u8 ea_value[0]; /* The value of the EA. Immediately follows the name. */ } __attribute__ ((__packed__)) EA_ATTR; /* * Attribute: Property set (0xf0). * * Intended to support Native Structure Storage (NSS) - a feature removed from * NTFS 3.0 during beta testing. */ typedef struct { /* Irrelevant as feature unused. */ } __attribute__ ((__packed__)) PROPERTY_SET; /* * Attribute: Logged utility stream (0x100). * * NOTE: Can be resident or non-resident. * * Operations on this attribute are logged to the journal ($LogFile) like * normal metadata changes. * * Used by the Encrypting File System (EFS). All encrypted files have this * attribute with the name $EFS. */ typedef struct { /* Can be anything the creator chooses. */ /* EFS uses it as follows: */ // FIXME: Type this info, verifying it along the way. (AIA) } __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR; #endif /* _LINUX_NTFS_LAYOUT_H */
1 1 3 3 2 2 1 1 1 1 3 3 2 1 1 1 1 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 /* * linux/fs/adfs/super.c * * Copyright (C) 1997-1999 Russell King * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include <linux/module.h> #include <linux/init.h> #include <linux/buffer_head.h> #include <linux/parser.h> #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/statfs.h> #include <linux/user_namespace.h> #include "adfs.h" #include "dir_f.h" #include "dir_fplus.h" #define ADFS_DEFAULT_OWNER_MASK S_IRWXU #define ADFS_DEFAULT_OTHER_MASK (S_IRWXG | S_IRWXO) void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...) { char error_buf[128]; va_list args; va_start(args, fmt); vsnprintf(error_buf, sizeof(error_buf), fmt, args); va_end(args); printk(KERN_CRIT "ADFS-fs error (device %s)%s%s: %s\n", sb->s_id, function ? ": " : "", function ? function : "", error_buf); } static int adfs_checkdiscrecord(struct adfs_discrecord *dr) { int i; /* sector size must be 256, 512 or 1024 bytes */ if (dr->log2secsize != 8 && dr->log2secsize != 9 && dr->log2secsize != 10) return 1; /* idlen must be at least log2secsize + 3 */ if (dr->idlen < dr->log2secsize + 3) return 1; /* we cannot have such a large disc that we * are unable to represent sector offsets in * 32 bits. This works out at 2.0 TB. */ if (le32_to_cpu(dr->disc_size_high) >> dr->log2secsize) return 1; /* idlen must be no greater than 19 v2 [1.0] */ if (dr->idlen > 19) return 1; /* reserved bytes should be zero */ for (i = 0; i < sizeof(dr->unused52); i++) if (dr->unused52[i] != 0) return 1; return 0; } static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) { unsigned int v0, v1, v2, v3; int i; v0 = v1 = v2 = v3 = 0; for (i = sb->s_blocksize - 4; i; i -= 4) { v0 += map[i] + (v3 >> 8); v3 &= 0xff; v1 += map[i + 1] + (v0 >> 8); v0 &= 0xff; v2 += map[i + 2] + (v1 >> 8); v1 &= 0xff; v3 += map[i + 3] + (v2 >> 8); v2 &= 0xff; } v0 += v3 >> 8; v1 += map[1] + (v0 >> 8); v2 += map[2] + (v1 >> 8); v3 += map[3] + (v2 >> 8); return v0 ^ v1 ^ v2 ^ v3; } static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) { unsigned char crosscheck = 0, zonecheck = 1; int i; for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { unsigned char *map; map = dm[i].dm_bh->b_data; if (adfs_calczonecheck(sb, map) != map[0]) { adfs_error(sb, "zone %d fails zonecheck", i); zonecheck = 0; } crosscheck ^= map[3]; } if (crosscheck != 0xff) adfs_error(sb, "crosscheck != 0xff"); return crosscheck == 0xff && zonecheck; } static void adfs_put_super(struct super_block *sb) { int i; struct adfs_sb_info *asb = ADFS_SB(sb); for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); kfree_rcu(asb, rcu); } static int adfs_show_options(struct seq_file *seq, struct dentry *root) { struct adfs_sb_info *asb = ADFS_SB(root->d_sb); if (!uid_eq(asb->s_uid, GLOBAL_ROOT_UID)) seq_printf(seq, ",uid=%u", from_kuid_munged(&init_user_ns, asb->s_uid)); if (!gid_eq(asb->s_gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, asb->s_gid)); if (asb->s_owner_mask != ADFS_DEFAULT_OWNER_MASK) seq_printf(seq, ",ownmask=%o", asb->s_owner_mask); if (asb->s_other_mask != ADFS_DEFAULT_OTHER_MASK) seq_printf(seq, ",othmask=%o", asb->s_other_mask); if (asb->s_ftsuffix != 0) seq_printf(seq, ",ftsuffix=%u", asb->s_ftsuffix); return 0; } enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_ftsuffix, Opt_err}; static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_ownmask, "ownmask=%o"}, {Opt_othmask, "othmask=%o"}, {Opt_ftsuffix, "ftsuffix=%u"}, {Opt_err, NULL} }; static int parse_options(struct super_block *sb, char *options) { char *p; struct adfs_sb_info *asb = ADFS_SB(sb); int option; if (!options) return 0; while ((p = strsep(&options, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case Opt_uid: if (match_int(args, &option)) return -EINVAL; asb->s_uid = make_kuid(current_user_ns(), option); if (!uid_valid(asb->s_uid)) return -EINVAL; break; case Opt_gid: if (match_int(args, &option)) return -EINVAL; asb->s_gid = make_kgid(current_user_ns(), option); if (!gid_valid(asb->s_gid)) return -EINVAL; break; case Opt_ownmask: if (match_octal(args, &option)) return -EINVAL; asb->s_owner_mask = option; break; case Opt_othmask: if (match_octal(args, &option)) return -EINVAL; asb->s_other_mask = option; break; case Opt_ftsuffix: if (match_int(args, &option)) return -EINVAL; asb->s_ftsuffix = option; break; default: printk("ADFS-fs: unrecognised mount option \"%s\" " "or missing value\n", p); return -EINVAL; } } return 0; } static int adfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); *flags |= MS_NODIRATIME; return parse_options(sb, data); } static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct adfs_sb_info *sbi = ADFS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = ADFS_SUPER_MAGIC; buf->f_namelen = sbi->s_namelen; buf->f_bsize = sb->s_blocksize; buf->f_blocks = sbi->s_size; buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size; buf->f_bavail = buf->f_bfree = adfs_map_free(sb); buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); return 0; } static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void adfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } static void adfs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, adfs_i_callback); } static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int __init init_inodecache(void) { adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (adfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(adfs_inode_cachep); } static const struct super_operations adfs_sops = { .alloc_inode = adfs_alloc_inode, .destroy_inode = adfs_destroy_inode, .write_inode = adfs_write_inode, .put_super = adfs_put_super, .statfs = adfs_statfs, .remount_fs = adfs_remount, .show_options = adfs_show_options, }; static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) { struct adfs_discmap *dm; unsigned int map_addr, zone_size, nzones; int i, zone; struct adfs_sb_info *asb = ADFS_SB(sb); nzones = asb->s_map_size; zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); map_addr = (nzones >> 1) * zone_size - ((nzones > 1) ? ADFS_DR_SIZE_BITS : 0); map_addr = signed_asl(map_addr, asb->s_map2blk); asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); dm = kmalloc(nzones * sizeof(*dm), GFP_KERNEL); if (dm == NULL) { adfs_error(sb, "not enough memory"); return ERR_PTR(-ENOMEM); } for (zone = 0; zone < nzones; zone++, map_addr++) { dm[zone].dm_startbit = 0; dm[zone].dm_endbit = zone_size; dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; dm[zone].dm_bh = sb_bread(sb, map_addr); if (!dm[zone].dm_bh) { adfs_error(sb, "unable to read map"); goto error_free; } } /* adjust the limits for the first and last map zones */ i = zone - 1; dm[0].dm_startblk = 0; dm[0].dm_startbit = ADFS_DR_SIZE_BITS; dm[i].dm_endbit = (le32_to_cpu(dr->disc_size_high) << (32 - dr->log2bpmb)) + (le32_to_cpu(dr->disc_size) >> dr->log2bpmb) + (ADFS_DR_SIZE_BITS - i * zone_size); if (adfs_checkmap(sb, dm)) return dm; adfs_error(sb, "map corrupted"); error_free: while (--zone >= 0) brelse(dm[zone].dm_bh); kfree(dm); return ERR_PTR(-EIO); } static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int block_bits) { unsigned long discsize; discsize = le32_to_cpu(dr->disc_size_high) << (32 - block_bits); discsize |= le32_to_cpu(dr->disc_size) >> block_bits; return discsize; } static int adfs_fill_super(struct super_block *sb, void *data, int silent) { struct adfs_discrecord *dr; struct buffer_head *bh; struct object_info root_obj; unsigned char *b_data; unsigned int blocksize; struct adfs_sb_info *asb; struct inode *root; int ret = -EINVAL; sb->s_flags |= MS_NODIRATIME; asb = kzalloc(sizeof(*asb), GFP_KERNEL); if (!asb) return -ENOMEM; sb->s_fs_info = asb; /* set default options */ asb->s_uid = GLOBAL_ROOT_UID; asb->s_gid = GLOBAL_ROOT_GID; asb->s_owner_mask = ADFS_DEFAULT_OWNER_MASK; asb->s_other_mask = ADFS_DEFAULT_OTHER_MASK; asb->s_ftsuffix = 0; if (parse_options(sb, data)) goto error; sb_set_blocksize(sb, BLOCK_SIZE); if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) { adfs_error(sb, "unable to read superblock"); ret = -EIO; goto error; } b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE); if (adfs_checkbblk(b_data)) { if (!silent) printk("VFS: Can't find an adfs filesystem on dev " "%s.\n", sb->s_id); ret = -EINVAL; goto error_free_bh; } dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); /* * Do some sanity checks on the ADFS disc record */ if (adfs_checkdiscrecord(dr)) { if (!silent) printk("VPS: Can't find an adfs filesystem on dev " "%s.\n", sb->s_id); ret = -EINVAL; goto error_free_bh; } blocksize = 1 << dr->log2secsize; brelse(bh); if (sb_set_blocksize(sb, blocksize)) { bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize); if (!bh) { adfs_error(sb, "couldn't read superblock on " "2nd try."); ret = -EIO; goto error; } b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); if (adfs_checkbblk(b_data)) { adfs_error(sb, "disc record mismatch, very weird!"); ret = -EINVAL; goto error_free_bh; } dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); } else { if (!silent) printk(KERN_ERR "VFS: Unsupported blocksize on dev " "%s.\n", sb->s_id); ret = -EINVAL; goto error; } /* * blocksize on this device should now be set to the ADFS log2secsize */ sb->s_magic = ADFS_SUPER_MAGIC; asb->s_idlen = dr->idlen; asb->s_map_size = dr->nzones | (dr->nzones_high << 8); asb->s_map2blk = dr->log2bpmb - dr->log2secsize; asb->s_size = adfs_discsize(dr, sb->s_blocksize_bits); asb->s_version = dr->format_version; asb->s_log2sharesize = dr->log2sharesize; asb->s_map = adfs_read_map(sb, dr); if (IS_ERR(asb->s_map)) { ret = PTR_ERR(asb->s_map); goto error_free_bh; } brelse(bh); /* * set up enough so that we can read an inode */ sb->s_op = &adfs_sops; dr = (struct adfs_discrecord *)(asb->s_map[0].dm_bh->b_data + 4); root_obj.parent_id = root_obj.file_id = le32_to_cpu(dr->root); root_obj.name_len = 0; /* Set root object date as 01 Jan 1987 00:00:00 */ root_obj.loadaddr = 0xfff0003f; root_obj.execaddr = 0xec22c000; root_obj.size = ADFS_NEWDIR_SIZE; root_obj.attr = ADFS_NDA_DIRECTORY | ADFS_NDA_OWNER_READ | ADFS_NDA_OWNER_WRITE | ADFS_NDA_PUBLIC_READ; root_obj.filetype = -1; /* * If this is a F+ disk with variable length directories, * get the root_size from the disc record. */ if (asb->s_version) { root_obj.size = le32_to_cpu(dr->root_size); asb->s_dir = &adfs_fplus_dir_ops; asb->s_namelen = ADFS_FPLUS_NAME_LEN; } else { asb->s_dir = &adfs_f_dir_ops; asb->s_namelen = ADFS_F_NAME_LEN; } /* * ,xyz hex filetype suffix may be added by driver * to files that have valid RISC OS filetype */ if (asb->s_ftsuffix) asb->s_namelen += 4; sb->s_d_op = &adfs_dentry_operations; root = adfs_iget(sb, &root_obj); sb->s_root = d_make_root(root); if (!sb->s_root) { int i; for (i = 0; i < asb->s_map_size; i++) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); adfs_error(sb, "get root inode failed\n"); ret = -EIO; goto error; } return 0; error_free_bh: brelse(bh); error: sb->s_fs_info = NULL; kfree(asb); return ret; } static struct dentry *adfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, adfs_fill_super); } static struct file_system_type adfs_fs_type = { .owner = THIS_MODULE, .name = "adfs", .mount = adfs_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("adfs"); static int __init init_adfs_fs(void) { int err = init_inodecache(); if (err) goto out1; err = register_filesystem(&adfs_fs_type); if (err) goto out; return 0; out: destroy_inodecache(); out1: return err; } static void __exit exit_adfs_fs(void) { unregister_filesystem(&adfs_fs_type); destroy_inodecache(); } module_init(init_adfs_fs) module_exit(exit_adfs_fs) MODULE_LICENSE("GPL");
16 20 126 656 222 189 23 35 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TTY_H #define _LINUX_TTY_H #include <linux/fs.h> #include <linux/major.h> #include <linux/termios.h> #include <linux/workqueue.h> #include <linux/tty_driver.h> #include <linux/tty_ldisc.h> #include <linux/mutex.h> #include <linux/tty_flags.h> #include <uapi/linux/tty.h> #include <linux/rwsem.h> #include <linux/llist.h> /* * Lock subclasses for tty locks * * TTY_LOCK_NORMAL is for normal ttys and master ptys. * TTY_LOCK_SLAVE is for slave ptys only. * * Lock subclasses are necessary for handling nested locking with pty pairs. * tty locks which use nested locking: * * legacy_mutex - Nested tty locks are necessary for releasing pty pairs. * The stable lock order is master pty first, then slave pty. * termios_rwsem - The stable lock order is tty_buffer lock->termios_rwsem. * Subclassing this lock enables the slave pty to hold its * termios_rwsem when claiming the master tty_buffer lock. * tty_buffer lock - slave ptys can claim nested buffer lock when handling * signal chars. The stable lock order is slave pty, then * master. */ enum { TTY_LOCK_NORMAL = 0, TTY_LOCK_SLAVE, }; /* * (Note: the *_driver.minor_start values 1, 64, 128, 192 are * hardcoded at present.) */ #define NR_UNIX98_PTY_DEFAULT 4096 /* Default maximum for Unix98 ptys */ #define NR_UNIX98_PTY_RESERVE 1024 /* Default reserve for main devpts */ #define NR_UNIX98_PTY_MAX (1 << MINORBITS) /* Absolute limit */ /* * This character is the same as _POSIX_VDISABLE: it cannot be used as * a c_cc[] character, but indicates that a particular special character * isn't in use (eg VINTR has no character etc) */ #define __DISABLED_CHAR '\0' struct tty_buffer { union { struct tty_buffer *next; struct llist_node free; }; int used; int size; int commit; int read; int flags; /* Data points here */ unsigned long data[]; }; /* Values for .flags field of tty_buffer */ #define TTYB_NORMAL 1 /* buffer has no flags buffer */ static inline unsigned char *char_buf_ptr(struct tty_buffer *b, int ofs) { return ((unsigned char *)b->data) + ofs; } static inline char *flag_buf_ptr(struct tty_buffer *b, int ofs) { return (char *)char_buf_ptr(b, ofs) + b->size; } struct tty_bufhead { struct tty_buffer *head; /* Queue head */ struct work_struct work; struct mutex lock; atomic_t priority; struct tty_buffer sentinel; struct llist_head free; /* Free queue head */ atomic_t mem_used; /* In-use buffers excluding free list */ int mem_limit; struct tty_buffer *tail; /* Active buffer */ }; /* * When a break, frame error, or parity error happens, these codes are * stuffed into the flags buffer. */ #define TTY_NORMAL 0 #define TTY_BREAK 1 #define TTY_FRAME 2 #define TTY_PARITY 3 #define TTY_OVERRUN 4 #define INTR_CHAR(tty) ((tty)->termios.c_cc[VINTR]) #define QUIT_CHAR(tty) ((tty)->termios.c_cc[VQUIT]) #define ERASE_CHAR(tty) ((tty)->termios.c_cc[VERASE]) #define KILL_CHAR(tty) ((tty)->termios.c_cc[VKILL]) #define EOF_CHAR(tty) ((tty)->termios.c_cc[VEOF]) #define TIME_CHAR(tty) ((tty)->termios.c_cc[VTIME]) #define MIN_CHAR(tty) ((tty)->termios.c_cc[VMIN]) #define SWTC_CHAR(tty) ((tty)->termios.c_cc[VSWTC]) #define START_CHAR(tty) ((tty)->termios.c_cc[VSTART]) #define STOP_CHAR(tty) ((tty)->termios.c_cc[VSTOP]) #define SUSP_CHAR(tty) ((tty)->termios.c_cc[VSUSP]) #define EOL_CHAR(tty) ((tty)->termios.c_cc[VEOL]) #define REPRINT_CHAR(tty) ((tty)->termios.c_cc[VREPRINT]) #define DISCARD_CHAR(tty) ((tty)->termios.c_cc[VDISCARD]) #define WERASE_CHAR(tty) ((tty)->termios.c_cc[VWERASE]) #define LNEXT_CHAR(tty) ((tty)->termios.c_cc[VLNEXT]) #define EOL2_CHAR(tty) ((tty)->termios.c_cc[VEOL2]) #define _I_FLAG(tty, f) ((tty)->termios.c_iflag & (f)) #define _O_FLAG(tty, f) ((tty)->termios.c_oflag & (f)) #define _C_FLAG(tty, f) ((tty)->termios.c_cflag & (f)) #define _L_FLAG(tty, f) ((tty)->termios.c_lflag & (f)) #define I_IGNBRK(tty) _I_FLAG((tty), IGNBRK) #define I_BRKINT(tty) _I_FLAG((tty), BRKINT) #define I_IGNPAR(tty) _I_FLAG((tty), IGNPAR) #define I_PARMRK(tty) _I_FLAG((tty), PARMRK) #define I_INPCK(tty) _I_FLAG((tty), INPCK) #define I_ISTRIP(tty) _I_FLAG((tty), ISTRIP) #define I_INLCR(tty) _I_FLAG((tty), INLCR) #define I_IGNCR(tty) _I_FLAG((tty), IGNCR) #define I_ICRNL(tty) _I_FLAG((tty), ICRNL) #define I_IUCLC(tty) _I_FLAG((tty), IUCLC) #define I_IXON(tty) _I_FLAG((tty), IXON) #define I_IXANY(tty) _I_FLAG((tty), IXANY) #define I_IXOFF(tty) _I_FLAG((tty), IXOFF) #define I_IMAXBEL(tty) _I_FLAG((tty), IMAXBEL) #define I_IUTF8(tty) _I_FLAG((tty), IUTF8) #define O_OPOST(tty) _O_FLAG((tty), OPOST) #define O_OLCUC(tty) _O_FLAG((tty), OLCUC) #define O_ONLCR(tty) _O_FLAG((tty), ONLCR) #define O_OCRNL(tty) _O_FLAG((tty), OCRNL) #define O_ONOCR(tty) _O_FLAG((tty), ONOCR) #define O_ONLRET(tty) _O_FLAG((tty), ONLRET) #define O_OFILL(tty) _O_FLAG((tty), OFILL) #define O_OFDEL(tty) _O_FLAG((tty), OFDEL) #define O_NLDLY(tty) _O_FLAG((tty), NLDLY) #define O_CRDLY(tty) _O_FLAG((tty), CRDLY) #define O_TABDLY(tty) _O_FLAG((tty), TABDLY) #define O_BSDLY(tty) _O_FLAG((tty), BSDLY) #define O_VTDLY(tty) _O_FLAG((tty), VTDLY) #define O_FFDLY(tty) _O_FLAG((tty), FFDLY) #define C_BAUD(tty) _C_FLAG((tty), CBAUD) #define C_CSIZE(tty) _C_FLAG((tty), CSIZE) #define C_CSTOPB(tty) _C_FLAG((tty), CSTOPB) #define C_CREAD(tty) _C_FLAG((tty), CREAD) #define C_PARENB(tty) _C_FLAG((tty), PARENB) #define C_PARODD(tty) _C_FLAG((tty), PARODD) #define C_HUPCL(tty) _C_FLAG((tty), HUPCL) #define C_CLOCAL(tty) _C_FLAG((tty), CLOCAL) #define C_CIBAUD(tty) _C_FLAG((tty), CIBAUD) #define C_CRTSCTS(tty) _C_FLAG((tty), CRTSCTS) #define C_CMSPAR(tty) _C_FLAG((tty), CMSPAR) #define L_ISIG(tty) _L_FLAG((tty), ISIG) #define L_ICANON(tty) _L_FLAG((tty), ICANON) #define L_XCASE(tty) _L_FLAG((tty), XCASE) #define L_ECHO(tty) _L_FLAG((tty), ECHO) #define L_ECHOE(tty) _L_FLAG((tty), ECHOE) #define L_ECHOK(tty) _L_FLAG((tty), ECHOK) #define L_ECHONL(tty) _L_FLAG((tty), ECHONL) #define L_NOFLSH(tty) _L_FLAG((tty), NOFLSH) #define L_TOSTOP(tty) _L_FLAG((tty), TOSTOP) #define L_ECHOCTL(tty) _L_FLAG((tty), ECHOCTL) #define L_ECHOPRT(tty) _L_FLAG((tty), ECHOPRT) #define L_ECHOKE(tty) _L_FLAG((tty), ECHOKE) #define L_FLUSHO(tty) _L_FLAG((tty), FLUSHO) #define L_PENDIN(tty) _L_FLAG((tty), PENDIN) #define L_IEXTEN(tty) _L_FLAG((tty), IEXTEN) #define L_EXTPROC(tty) _L_FLAG((tty), EXTPROC) struct device; struct signal_struct; /* * Port level information. Each device keeps its own port level information * so provide a common structure for those ports wanting to use common support * routines. * * The tty port has a different lifetime to the tty so must be kept apart. * In addition be careful as tty -> port mappings are valid for the life * of the tty object but in many cases port -> tty mappings are valid only * until a hangup so don't use the wrong path. */ struct tty_port; struct tty_port_operations { /* Return 1 if the carrier is raised */ int (*carrier_raised)(struct tty_port *port); /* Control the DTR line */ void (*dtr_rts)(struct tty_port *port, int raise); /* Called when the last close completes or a hangup finishes IFF the port was initialized. Do not use to free resources. Called under the port mutex to serialize against activate/shutdowns */ void (*shutdown)(struct tty_port *port); /* Called under the port mutex from tty_port_open, serialized using the port mutex */ /* FIXME: long term getting the tty argument *out* of this would be good for consoles */ int (*activate)(struct tty_port *port, struct tty_struct *tty); /* Called on the final put of a port */ void (*destruct)(struct tty_port *port); }; struct tty_port_client_operations { int (*receive_buf)(struct tty_port *port, const unsigned char *, const unsigned char *, size_t); void (*write_wakeup)(struct tty_port *port); }; extern const struct tty_port_client_operations tty_port_default_client_ops; struct tty_port { struct tty_bufhead buf; /* Locked internally */ struct tty_struct *tty; /* Back pointer */ struct tty_struct *itty; /* internal back ptr */ const struct tty_port_operations *ops; /* Port operations */ const struct tty_port_client_operations *client_ops; /* Port client operations */ spinlock_t lock; /* Lock protecting tty field */ int blocked_open; /* Waiting to open */ int count; /* Usage count */ wait_queue_head_t open_wait; /* Open waiters */ wait_queue_head_t delta_msr_wait; /* Modem status change */ unsigned long flags; /* User TTY flags ASYNC_ */ unsigned long iflags; /* Internal flags TTY_PORT_ */ unsigned char console:1, /* port is a console */ low_latency:1; /* optional: tune for latency */ struct mutex mutex; /* Locking */ struct mutex buf_mutex; /* Buffer alloc lock */ unsigned char *xmit_buf; /* Optional buffer */ unsigned int close_delay; /* Close port delay */ unsigned int closing_wait; /* Delay for output */ int drain_delay; /* Set to zero if no pure time based drain is needed else set to size of fifo */ struct kref kref; /* Ref counter */ void *client_data; }; /* tty_port::iflags bits -- use atomic bit ops */ #define TTY_PORT_INITIALIZED 0 /* device is initialized */ #define TTY_PORT_SUSPENDED 1 /* device is suspended */ #define TTY_PORT_ACTIVE 2 /* device is open */ /* * uart drivers: use the uart_port::status field and the UPSTAT_* defines * for s/w-based flow control steering and carrier detection status */ #define TTY_PORT_CTS_FLOW 3 /* h/w flow control enabled */ #define TTY_PORT_CHECK_CD 4 /* carrier detect enabled */ #define TTY_PORT_KOPENED 5 /* device exclusively opened by kernel */ /* * Where all of the state associated with a tty is kept while the tty * is open. Since the termios state should be kept even if the tty * has been closed --- for things like the baud rate, etc --- it is * not stored here, but rather a pointer to the real state is stored * here. Possible the winsize structure should have the same * treatment, but (1) the default 80x24 is usually right and (2) it's * most often used by a windowing system, which will set the correct * size each time the window is created or resized anyway. * - TYT, 9/14/92 */ struct tty_operations; struct tty_struct { int magic; struct kref kref; struct device *dev; struct tty_driver *driver; const struct tty_operations *ops; int index; /* Protects ldisc changes: Lock tty not pty */ struct ld_semaphore ldisc_sem; struct tty_ldisc *ldisc; struct mutex atomic_write_lock; struct mutex legacy_mutex; struct mutex throttle_mutex; struct rw_semaphore termios_rwsem; struct mutex winsize_mutex; spinlock_t ctrl_lock; spinlock_t flow_lock; /* Termios values are protected by the termios rwsem */ struct ktermios termios, termios_locked; struct termiox *termiox; /* May be NULL for unsupported */ char name[64]; struct pid *pgrp; /* Protected by ctrl lock */ /* * Writes protected by both ctrl lock and legacy mutex, readers must use * at least one of them. */ struct pid *session; unsigned long flags; int count; struct winsize winsize; /* winsize_mutex */ unsigned long stopped:1, /* flow_lock */ flow_stopped:1, unused:BITS_PER_LONG - 2; int hw_stopped; unsigned long ctrl_status:8, /* ctrl_lock */ packet:1, unused_ctrl:BITS_PER_LONG - 9; unsigned int receive_room; /* Bytes free for queue */ int flow_change; struct tty_struct *link; struct fasync_struct *fasync; wait_queue_head_t write_wait; wait_queue_head_t read_wait; struct work_struct hangup_work; void *disc_data; void *driver_data; spinlock_t files_lock; /* protects tty_files list */ struct list_head tty_files; #define N_TTY_BUF_SIZE 4096 int closing; unsigned char *write_buf; int write_cnt; /* If the tty has a pending do_SAK, queue it here - akpm */ struct work_struct SAK_work; struct tty_port *port; } __randomize_layout; /* Each of a tty's open files has private_data pointing to tty_file_private */ struct tty_file_private { struct tty_struct *tty; struct file *file; struct list_head list; }; /* tty magic number */ #define TTY_MAGIC 0x5401 /* * These bits are used in the flags field of the tty structure. * * So that interrupts won't be able to mess up the queues, * copy_to_cooked must be atomic with respect to itself, as must * tty->write. Thus, you must use the inline functions set_bit() and * clear_bit() to make things atomic. */ #define TTY_THROTTLED 0 /* Call unthrottle() at threshold min */ #define TTY_IO_ERROR 1 /* Cause an I/O error (may be no ldisc too) */ #define TTY_OTHER_CLOSED 2 /* Other side (if any) has closed */ #define TTY_EXCLUSIVE 3 /* Exclusive open mode */ #define TTY_DO_WRITE_WAKEUP 5 /* Call write_wakeup after queuing new */ #define TTY_LDISC_OPEN 11 /* Line discipline is open */ #define TTY_PTY_LOCK 16 /* pty private */ #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ #define TTY_HUPPED 18 /* Post driver->hangup() */ #define TTY_HUPPING 19 /* Hangup in progress */ #define TTY_LDISC_CHANGING 20 /* Change pending - non-block IO */ #define TTY_LDISC_HALTED 22 /* Line discipline is halted */ /* Values for tty->flow_change */ #define TTY_THROTTLE_SAFE 1 #define TTY_UNTHROTTLE_SAFE 2 static inline void __tty_set_flow_change(struct tty_struct *tty, int val) { tty->flow_change = val; } static inline void tty_set_flow_change(struct tty_struct *tty, int val) { tty->flow_change = val; smp_mb(); } static inline bool tty_io_nonblock(struct tty_struct *tty, struct file *file) { return file->f_flags & O_NONBLOCK || test_bit(TTY_LDISC_CHANGING, &tty->flags); } static inline bool tty_io_error(struct tty_struct *tty) { return test_bit(TTY_IO_ERROR, &tty->flags); } static inline bool tty_throttled(struct tty_struct *tty) { return test_bit(TTY_THROTTLED, &tty->flags); } #ifdef CONFIG_TTY extern void tty_kref_put(struct tty_struct *tty); extern struct pid *tty_get_pgrp(struct tty_struct *tty); extern void tty_vhangup_self(void); extern void disassociate_ctty(int priv); extern dev_t tty_devnum(struct tty_struct *tty); extern void proc_clear_tty(struct task_struct *p); extern struct tty_struct *get_current_tty(void); /* tty_io.c */ extern int __init tty_init(void); extern const char *tty_name(const struct tty_struct *tty); extern struct tty_struct *tty_kopen(dev_t device); extern void tty_kclose(struct tty_struct *tty); extern int tty_dev_name_to_number(const char *name, dev_t *number); extern int tty_ldisc_lock(struct tty_struct *tty, unsigned long timeout); extern void tty_ldisc_unlock(struct tty_struct *tty); #else static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) { return NULL; } static inline void tty_vhangup_self(void) { } static inline void disassociate_ctty(int priv) { } static inline dev_t tty_devnum(struct tty_struct *tty) { return 0; } static inline void proc_clear_tty(struct task_struct *p) { } static inline struct tty_struct *get_current_tty(void) { return NULL; } /* tty_io.c */ static inline int __init tty_init(void) { return 0; } static inline const char *tty_name(const struct tty_struct *tty) { return "(none)"; } static inline struct tty_struct *tty_kopen(dev_t device) { return ERR_PTR(-ENODEV); } static inline void tty_kclose(struct tty_struct *tty) { } static inline int tty_dev_name_to_number(const char *name, dev_t *number) { return -ENOTSUPP; } #endif extern struct ktermios tty_std_termios; extern int vcs_init(void); extern struct class *tty_class; /** * tty_kref_get - get a tty reference * @tty: tty device * * Return a new reference to a tty object. The caller must hold * sufficient locks/counts to ensure that their existing reference cannot * go away */ static inline struct tty_struct *tty_kref_get(struct tty_struct *tty) { if (tty) kref_get(&tty->kref); return tty; } extern const char *tty_driver_name(const struct tty_struct *tty); extern void tty_wait_until_sent(struct tty_struct *tty, long timeout); extern int __tty_check_change(struct tty_struct *tty, int sig); extern int tty_check_change(struct tty_struct *tty); extern void __stop_tty(struct tty_struct *tty); extern void stop_tty(struct tty_struct *tty); extern void __start_tty(struct tty_struct *tty); extern void start_tty(struct tty_struct *tty); extern int tty_register_driver(struct tty_driver *driver); extern int tty_unregister_driver(struct tty_driver *driver); extern struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); extern struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); extern void tty_unregister_device(struct tty_driver *driver, unsigned index); extern void tty_write_message(struct tty_struct *tty, char *msg); extern int tty_send_xchar(struct tty_struct *tty, char ch); extern int tty_put_char(struct tty_struct *tty, unsigned char c); extern int tty_chars_in_buffer(struct tty_struct *tty); extern int tty_write_room(struct tty_struct *tty); extern void tty_driver_flush_buffer(struct tty_struct *tty); extern void tty_throttle(struct tty_struct *tty); extern void tty_unthrottle(struct tty_struct *tty); extern int tty_throttle_safe(struct tty_struct *tty); extern int tty_unthrottle_safe(struct tty_struct *tty); extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws); extern int is_current_pgrp_orphaned(void); extern void tty_hangup(struct tty_struct *tty); extern void tty_vhangup(struct tty_struct *tty); extern void tty_vhangup_session(struct tty_struct *tty); extern int tty_hung_up_p(struct file *filp); extern void do_SAK(struct tty_struct *tty); extern void __do_SAK(struct tty_struct *tty); extern void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty); extern int tty_signal_session_leader(struct tty_struct *tty, int exit_session); extern void session_clear_tty(struct pid *session); extern void no_tty(void); extern void tty_buffer_free_all(struct tty_port *port); extern void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld); extern void tty_buffer_init(struct tty_port *port); extern void tty_buffer_set_lock_subclass(struct tty_port *port); extern bool tty_buffer_restart_work(struct tty_port *port); extern bool tty_buffer_cancel_work(struct tty_port *port); extern void tty_buffer_flush_work(struct tty_port *port); extern speed_t tty_termios_baud_rate(struct ktermios *termios); extern speed_t tty_termios_input_baud_rate(struct ktermios *termios); extern void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); extern void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud); /** * tty_get_baud_rate - get tty bit rates * @tty: tty to query * * Returns the baud rate as an integer for this terminal. The * termios lock must be held by the caller and the terminal bit * flags may be updated. * * Locking: none */ static inline speed_t tty_get_baud_rate(struct tty_struct *tty) { return tty_termios_baud_rate(&tty->termios); } extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old); extern int tty_termios_hw_change(struct ktermios *a, struct ktermios *b); extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *); extern void tty_ldisc_deref(struct tty_ldisc *); extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *); extern void tty_ldisc_hangup(struct tty_struct *tty, bool reset); extern int tty_ldisc_reinit(struct tty_struct *tty, int disc); extern const struct file_operations tty_ldiscs_proc_fops; extern void tty_wakeup(struct tty_struct *tty); extern void tty_ldisc_flush(struct tty_struct *tty); extern long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg); extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); extern long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty, struct file *file, unsigned int cmd, unsigned long arg); extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg); extern void tty_default_fops(struct file_operations *fops); extern struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx); extern int tty_alloc_file(struct file *file); extern void tty_add_file(struct tty_struct *tty, struct file *file); extern void tty_free_file(struct file *file); extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx); extern void tty_release_struct(struct tty_struct *tty, int idx); extern int tty_release(struct inode *inode, struct file *filp); extern void tty_init_termios(struct tty_struct *tty); extern int tty_standard_install(struct tty_driver *driver, struct tty_struct *tty); extern struct mutex tty_mutex; #define tty_is_writelocked(tty) (mutex_is_locked(&tty->atomic_write_lock)) extern void tty_port_init(struct tty_port *port); extern void tty_port_link_device(struct tty_port *port, struct tty_driver *driver, unsigned index); extern struct device *tty_port_register_device(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device); extern struct device *tty_port_register_device_attr(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); extern struct device *tty_port_register_device_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device); extern struct device *tty_port_register_device_attr_serdev(struct tty_port *port, struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); extern void tty_port_unregister_device(struct tty_port *port, struct tty_driver *driver, unsigned index); extern int tty_port_alloc_xmit_buf(struct tty_port *port); extern void tty_port_free_xmit_buf(struct tty_port *port); extern void tty_port_destroy(struct tty_port *port); extern void tty_port_put(struct tty_port *port); static inline struct tty_port *tty_port_get(struct tty_port *port) { if (port && kref_get_unless_zero(&port->kref)) return port; return NULL; } /* If the cts flow control is enabled, return true. */ static inline bool tty_port_cts_enabled(struct tty_port *port) { return test_bit(TTY_PORT_CTS_FLOW, &port->iflags); } static inline void tty_port_set_cts_flow(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_CTS_FLOW, &port->iflags); else clear_bit(TTY_PORT_CTS_FLOW, &port->iflags); } static inline bool tty_port_active(struct tty_port *port) { return test_bit(TTY_PORT_ACTIVE, &port->iflags); } static inline void tty_port_set_active(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_ACTIVE, &port->iflags); else clear_bit(TTY_PORT_ACTIVE, &port->iflags); } static inline bool tty_port_check_carrier(struct tty_port *port) { return test_bit(TTY_PORT_CHECK_CD, &port->iflags); } static inline void tty_port_set_check_carrier(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_CHECK_CD, &port->iflags); else clear_bit(TTY_PORT_CHECK_CD, &port->iflags); } static inline bool tty_port_suspended(struct tty_port *port) { return test_bit(TTY_PORT_SUSPENDED, &port->iflags); } static inline void tty_port_set_suspended(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_SUSPENDED, &port->iflags); else clear_bit(TTY_PORT_SUSPENDED, &port->iflags); } static inline bool tty_port_initialized(struct tty_port *port) { return test_bit(TTY_PORT_INITIALIZED, &port->iflags); } static inline void tty_port_set_initialized(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_INITIALIZED, &port->iflags); else clear_bit(TTY_PORT_INITIALIZED, &port->iflags); } static inline bool tty_port_kopened(struct tty_port *port) { return test_bit(TTY_PORT_KOPENED, &port->iflags); } static inline void tty_port_set_kopened(struct tty_port *port, bool val) { if (val) set_bit(TTY_PORT_KOPENED, &port->iflags); else clear_bit(TTY_PORT_KOPENED, &port->iflags); } extern struct tty_struct *tty_port_tty_get(struct tty_port *port); extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty); extern int tty_port_carrier_raised(struct tty_port *port); extern void tty_port_raise_dtr_rts(struct tty_port *port); extern void tty_port_lower_dtr_rts(struct tty_port *port); extern void tty_port_hangup(struct tty_port *port); extern void tty_port_tty_hangup(struct tty_port *port, bool check_clocal); extern void tty_port_tty_wakeup(struct tty_port *port); extern int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp); extern int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct file *filp); extern void tty_port_close_end(struct tty_port *port, struct tty_struct *tty); extern void tty_port_close(struct tty_port *port, struct tty_struct *tty, struct file *filp); extern int tty_port_install(struct tty_port *port, struct tty_driver *driver, struct tty_struct *tty); extern int tty_port_open(struct tty_port *port, struct tty_struct *tty, struct file *filp); static inline int tty_port_users(struct tty_port *port) { return port->count + port->blocked_open; } extern int tty_register_ldisc(int disc, struct tty_ldisc_ops *new_ldisc); extern int tty_unregister_ldisc(int disc); extern int tty_set_ldisc(struct tty_struct *tty, int disc); extern int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty); extern void tty_ldisc_release(struct tty_struct *tty); extern int __must_check tty_ldisc_init(struct tty_struct *tty); extern void tty_ldisc_deinit(struct tty_struct *tty); extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p, char *f, int count); /* n_tty.c */ extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops); #ifdef CONFIG_TTY extern void __init n_tty_init(void); #else static inline void n_tty_init(void) { } #endif /* tty_audit.c */ #ifdef CONFIG_AUDIT extern void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size); extern void tty_audit_exit(void); extern void tty_audit_fork(struct signal_struct *sig); extern void tty_audit_tiocsti(struct tty_struct *tty, char ch); extern int tty_audit_push(void); #else static inline void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size) { } static inline void tty_audit_tiocsti(struct tty_struct *tty, char ch) { } static inline void tty_audit_exit(void) { } static inline void tty_audit_fork(struct signal_struct *sig) { } static inline int tty_audit_push(void) { return 0; } #endif /* tty_ioctl.c */ extern int n_tty_ioctl_helper(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); extern long n_tty_compat_ioctl_helper(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); /* vt.c */ extern int vt_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); extern long vt_compat_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg); /* tty_mutex.c */ /* functions for preparation of BKL removal */ extern void tty_lock(struct tty_struct *tty); extern int tty_lock_interruptible(struct tty_struct *tty); extern void tty_unlock(struct tty_struct *tty); extern void tty_lock_slave(struct tty_struct *tty); extern void tty_unlock_slave(struct tty_struct *tty); extern void tty_set_lock_subclass(struct tty_struct *tty); #ifdef CONFIG_PROC_FS extern void proc_tty_register_driver(struct tty_driver *); extern void proc_tty_unregister_driver(struct tty_driver *); #else static inline void proc_tty_register_driver(struct tty_driver *d) {} static inline void proc_tty_unregister_driver(struct tty_driver *d) {} #endif #define tty_msg(fn, tty, f, ...) \ fn("%s %s: " f, tty_driver_name(tty), tty_name(tty), ##__VA_ARGS__) #define tty_debug(tty, f, ...) tty_msg(pr_debug, tty, f, ##__VA_ARGS__) #define tty_info(tty, f, ...) tty_msg(pr_info, tty, f, ##__VA_ARGS__) #define tty_notice(tty, f, ...) tty_msg(pr_notice, tty, f, ##__VA_ARGS__) #define tty_warn(tty, f, ...) tty_msg(pr_warn, tty, f, ##__VA_ARGS__) #define tty_err(tty, f, ...) tty_msg(pr_err, tty, f, ##__VA_ARGS__) #define tty_info_ratelimited(tty, f, ...) \ tty_msg(pr_info_ratelimited, tty, f, ##__VA_ARGS__) #endif
148 146 146 90 91 91 91 1 90 90 165 165 41 2 41 41 41 41 44 44 44 44 44 129 129 1 44 44 150 129 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 * Phillip Lougher <phillip@squashfs.org.uk> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2, * or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * * inode.c */ /* * This file implements code to create and read inodes from disk. * * Inodes in Squashfs are identified by a 48-bit inode which encodes the * location of the compressed metadata block containing the inode, and the byte * offset into that block where the inode is placed (<block, offset>). * * To maximise compression there are different inodes for each file type * (regular file, directory, device, etc.), the inode contents and length * varying with the type. * * To further maximise compression, two types of regular file inode and * directory inode are defined: inodes optimised for frequently occurring * regular files and directories, and extended types where extra * information has to be stored. */ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/xattr.h> #include <linux/pagemap.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs_fs_i.h" #include "squashfs.h" #include "xattr.h" /* * Initialise VFS inode with the base inode information common to all * Squashfs inode types. Sqsh_ino contains the unswapped base inode * off disk. */ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, struct squashfs_base_inode *sqsh_ino) { uid_t i_uid; gid_t i_gid; int err; err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &i_uid); if (err) return err; err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &i_gid); if (err) return err; i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); inode->i_atime.tv_sec = inode->i_mtime.tv_sec; inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; return err; } struct inode *squashfs_iget(struct super_block *sb, long long ino, unsigned int ino_number) { struct inode *inode = iget_locked(sb, ino_number); int err; TRACE("Entered squashfs_iget\n"); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; err = squashfs_read_inode(inode, ino); if (err) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } /* * Initialise VFS inode by reading inode from inode table (compressed * metadata). The format and amount of data read depends on type. */ int squashfs_read_inode(struct inode *inode, long long ino) { struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; int err, type, offset = SQUASHFS_INODE_OFFSET(ino); union squashfs_inode squashfs_ino; struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; int xattr_id = SQUASHFS_INVALID_XATTR; TRACE("Entered squashfs_read_inode\n"); /* * Read inode base common to all inode types. */ err = squashfs_read_metadata(sb, sqshb_ino, &block, &offset, sizeof(*sqshb_ino)); if (err < 0) goto failed_read; err = squashfs_new_inode(sb, inode, sqshb_ino); if (err) goto failed_read; block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; offset = SQUASHFS_INODE_OFFSET(ino); type = le16_to_cpu(sqshb_ino->inode_type); switch (type) { case SQUASHFS_REG_TYPE: { unsigned int frag_offset, frag; int frag_size; u64 frag_blk; struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { err = frag_size; goto failed_read; } } else { frag_blk = SQUASHFS_INVALID_BLK; frag_size = 0; frag_offset = 0; } set_nlink(inode, 1); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; squashfs_i(inode)->fragment_offset = frag_offset; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, block, offset); break; } case SQUASHFS_LREG_TYPE: { unsigned int frag_offset, frag; int frag_size; u64 frag_blk; struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; frag = le32_to_cpu(sqsh_ino->fragment); if (frag != SQUASHFS_INVALID_FRAG) { frag_offset = le32_to_cpu(sqsh_ino->offset); frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); if (frag_size < 0) { err = frag_size; goto failed_read; } } else { frag_blk = SQUASHFS_INVALID_BLK; frag_size = 0; frag_offset = 0; } xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le64_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_inode_ops; inode->i_fop = &generic_ro_fops; inode->i_mode |= S_IFREG; inode->i_blocks = (inode->i_size - le64_to_cpu(sqsh_ino->sparse) + 511) >> 9; squashfs_i(inode)->fragment_block = frag_blk; squashfs_i(inode)->fragment_size = frag_size; squashfs_i(inode)->fragment_offset = frag_offset; squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->block_list_start = block; squashfs_i(inode)->offset = offset; inode->i_data.a_ops = &squashfs_aops; TRACE("File inode %x:%x, start_block %llx, block_list_start " "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, block, offset); break; } case SQUASHFS_DIR_TYPE: { struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le16_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; inode->i_mode |= S_IFDIR; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); squashfs_i(inode)->dir_idx_cnt = 0; squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, le16_to_cpu(sqsh_ino->offset)); break; } case SQUASHFS_LDIR_TYPE: { struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; xattr_id = le32_to_cpu(sqsh_ino->xattr); set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->file_size); inode->i_op = &squashfs_dir_inode_ops; inode->i_fop = &squashfs_dir_ops; inode->i_mode |= S_IFDIR; squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); squashfs_i(inode)->dir_idx_start = block; squashfs_i(inode)->dir_idx_offset = offset; squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); TRACE("Long directory inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, squashfs_i(inode)->start, le16_to_cpu(sqsh_ino->offset)); break; } case SQUASHFS_SYMLINK_TYPE: case SQUASHFS_LSYMLINK_TYPE: { struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); inode->i_op = &squashfs_symlink_inode_ops; inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; squashfs_i(inode)->offset = offset; if (type == SQUASHFS_LSYMLINK_TYPE) { __le32 xattr; err = squashfs_read_metadata(sb, NULL, &block, &offset, inode->i_size); if (err < 0) goto failed_read; err = squashfs_read_metadata(sb, &xattr, &block, &offset, sizeof(xattr)); if (err < 0) goto failed_read; xattr_id = le32_to_cpu(xattr); } TRACE("Symbolic link inode %x:%x, start_block %llx, offset " "%x\n", SQUASHFS_INODE_BLK(ino), offset, block, offset); break; } case SQUASHFS_BLKDEV_TYPE: case SQUASHFS_CHRDEV_TYPE: { struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; unsigned int rdev; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_CHRDEV_TYPE) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); break; } case SQUASHFS_LBLKDEV_TYPE: case SQUASHFS_LCHRDEV_TYPE: { struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev; unsigned int rdev; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_LCHRDEV_TYPE) inode->i_mode |= S_IFCHR; else inode->i_mode |= S_IFBLK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); rdev = le32_to_cpu(sqsh_ino->rdev); init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); TRACE("Device inode %x:%x, rdev %x\n", SQUASHFS_INODE_BLK(ino), offset, rdev); break; } case SQUASHFS_FIFO_TYPE: case SQUASHFS_SOCKET_TYPE: { struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_FIFO_TYPE) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } case SQUASHFS_LFIFO_TYPE: case SQUASHFS_LSOCKET_TYPE: { struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc; err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, sizeof(*sqsh_ino)); if (err < 0) goto failed_read; if (type == SQUASHFS_LFIFO_TYPE) inode->i_mode |= S_IFIFO; else inode->i_mode |= S_IFSOCK; xattr_id = le32_to_cpu(sqsh_ino->xattr); inode->i_op = &squashfs_inode_ops; set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); init_special_inode(inode, inode->i_mode, 0); break; } default: ERROR("Unknown inode type %d in squashfs_iget!\n", type); return -EINVAL; } if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) { err = squashfs_xattr_lookup(sb, xattr_id, &squashfs_i(inode)->xattr_count, &squashfs_i(inode)->xattr_size, &squashfs_i(inode)->xattr); if (err < 0) goto failed_read; inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9) + 1; } else squashfs_i(inode)->xattr_count = 0; return 0; failed_read: ERROR("Unable to read inode 0x%llx\n", ino); return err; } const struct inode_operations squashfs_inode_ops = { .listxattr = squashfs_listxattr };
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 /* AFS cell and server record management * * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/key.h> #include <linux/ctype.h> #include <linux/dns_resolver.h> #include <linux/sched.h> #include <keys/rxrpc-type.h> #include "internal.h" DECLARE_RWSEM(afs_proc_cells_sem); LIST_HEAD(afs_proc_cells); static LIST_HEAD(afs_cells); static DEFINE_RWLOCK(afs_cells_lock); static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */ static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq); static struct afs_cell *afs_cell_root; /* * allocate a cell record and fill in its name, VL server address list and * allocate an anonymous key */ static struct afs_cell *afs_cell_alloc(const char *name, unsigned namelen, char *vllist) { struct afs_cell *cell; struct key *key; char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; char *dvllist = NULL, *_vllist = NULL; char delimiter = ':'; int ret; _enter("%*.*s,%s", namelen, namelen, name ?: "", vllist); BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ if (namelen > AFS_MAXCELLNAME) { _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } /* allocate and initialise a cell record */ cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); if (!cell) { _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } memcpy(cell->name, name, namelen); cell->name[namelen] = 0; atomic_set(&cell->usage, 1); INIT_LIST_HEAD(&cell->link); rwlock_init(&cell->servers_lock); INIT_LIST_HEAD(&cell->servers); init_rwsem(&cell->vl_sem); INIT_LIST_HEAD(&cell->vl_list); spin_lock_init(&cell->vl_lock); /* if the ip address is invalid, try dns query */ if (!vllist || strlen(vllist) < 7) { ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); if (ret < 0) { if (ret == -ENODATA || ret == -EAGAIN || ret == -ENOKEY) /* translate these errors into something * userspace might understand */ ret = -EDESTADDRREQ; _leave(" = %d", ret); return ERR_PTR(ret); } _vllist = dvllist; /* change the delimiter for user-space reply */ delimiter = ','; } else { _vllist = vllist; } /* fill in the VL server list from the rest of the string */ do { unsigned a, b, c, d; next = strchr(_vllist, delimiter); if (next) *next++ = 0; if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) goto bad_address; if (a > 255 || b > 255 || c > 255 || d > 255) goto bad_address; cell->vl_addrs[cell->vl_naddrs++].s_addr = htonl((a << 24) | (b << 16) | (c << 8) | d); } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); /* create a key to represent an anonymous user */ memcpy(keyname, "afs@", 4); dp = keyname + 4; cp = cell->name; do { *dp++ = toupper(*cp); } while (*cp++); key = rxrpc_get_null_key(keyname); if (IS_ERR(key)) { _debug("no key"); ret = PTR_ERR(key); goto error; } cell->anonymous_key = key; _debug("anon key %p{%x}", cell->anonymous_key, key_serial(cell->anonymous_key)); _leave(" = %p", cell); return cell; bad_address: printk(KERN_ERR "kAFS: bad VL server IP address\n"); ret = -EINVAL; error: key_put(cell->anonymous_key); kfree(dvllist); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); } /* * afs_cell_crate() - create a cell record * @name: is the name of the cell. * @namsesz: is the strlen of the cell name. * @vllist: is a colon separated list of IP addresses in "a.b.c.d" format. * @retref: is T to return the cell reference when the cell exists. */ struct afs_cell *afs_cell_create(const char *name, unsigned namesz, char *vllist, bool retref) { struct afs_cell *cell; int ret; _enter("%*.*s,%s", namesz, namesz, name ?: "", vllist); down_write(&afs_cells_sem); read_lock(&afs_cells_lock); list_for_each_entry(cell, &afs_cells, link) { if (strncasecmp(cell->name, name, namesz) == 0) goto duplicate_name; } read_unlock(&afs_cells_lock); cell = afs_cell_alloc(name, namesz, vllist); if (IS_ERR(cell)) { _leave(" = %ld", PTR_ERR(cell)); up_write(&afs_cells_sem); return cell; } /* add a proc directory for this cell */ ret = afs_proc_cell_setup(cell); if (ret < 0) goto error; #ifdef CONFIG_AFS_FSCACHE /* put it up for caching (this never returns an error) */ cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, &afs_cell_cache_index_def, cell, true); #endif /* add to the cell lists */ write_lock(&afs_cells_lock); list_add_tail(&cell->link, &afs_cells); write_unlock(&afs_cells_lock); down_write(&afs_proc_cells_sem); list_add_tail(&cell->proc_link, &afs_proc_cells); up_write(&afs_proc_cells_sem); up_write(&afs_cells_sem); _leave(" = %p", cell); return cell; error: up_write(&afs_cells_sem); key_put(cell->anonymous_key); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); duplicate_name: if (retref && !IS_ERR(cell)) afs_get_cell(cell); read_unlock(&afs_cells_lock); up_write(&afs_cells_sem); if (retref) { _leave(" = %p", cell); return cell; } _leave(" = -EEXIST"); return ERR_PTR(-EEXIST); } /* * set the root cell information * - can be called with a module parameter string * - can be called from a write to /proc/fs/afs/rootcell */ int afs_cell_init(char *rootcell) { struct afs_cell *old_root, *new_root; char *cp; _enter(""); if (!rootcell) { /* module is loaded with no parameters, or built statically. * - in the future we might initialize cell DB here. */ _leave(" = 0 [no root]"); return 0; } cp = strchr(rootcell, ':'); if (!cp) _debug("kAFS: no VL server IP addresses specified"); else *cp++ = 0; /* allocate a cell record for the root cell */ new_root = afs_cell_create(rootcell, strlen(rootcell), cp, false); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); return PTR_ERR(new_root); } /* install the new cell */ write_lock(&afs_cells_lock); old_root = afs_cell_root; afs_cell_root = new_root; write_unlock(&afs_cells_lock); afs_put_cell(old_root); _leave(" = 0"); return 0; } /* * lookup a cell record */ struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz, bool dns_cell) { struct afs_cell *cell; _enter("\"%*.*s\",", namesz, namesz, name ?: ""); down_read(&afs_cells_sem); read_lock(&afs_cells_lock); if (name) { /* if the cell was named, look for it in the cell record list */ list_for_each_entry(cell, &afs_cells, link) { if (strncmp(cell->name, name, namesz) == 0) { afs_get_cell(cell); goto found; } } cell = ERR_PTR(-ENOENT); if (dns_cell) goto create_cell; found: ; } else { cell = afs_cell_root; if (!cell) { /* this should not happen unless user tries to mount * when root cell is not set. Return an impossibly * bizarre errno to alert the user. Things like * ENOENT might be "more appropriate" but they happen * for other reasons. */ cell = ERR_PTR(-EDESTADDRREQ); } else { afs_get_cell(cell); } } read_unlock(&afs_cells_lock); up_read(&afs_cells_sem); _leave(" = %p", cell); return cell; create_cell: read_unlock(&afs_cells_lock); up_read(&afs_cells_sem); cell = afs_cell_create(name, namesz, NULL, true); _leave(" = %p", cell); return cell; } #if 0 /* * try and get a cell record */ struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell) { write_lock(&afs_cells_lock); if (cell && !list_empty(&cell->link)) afs_get_cell(cell); else cell = NULL; write_unlock(&afs_cells_lock); return cell; } #endif /* 0 */ /* * destroy a cell record */ void afs_put_cell(struct afs_cell *cell) { if (!cell) return; _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); ASSERTCMP(atomic_read(&cell->usage), >, 0); /* to prevent a race, the decrement and the dequeue must be effectively * atomic */ write_lock(&afs_cells_lock); if (likely(!atomic_dec_and_test(&cell->usage))) { write_unlock(&afs_cells_lock); _leave(""); return; } ASSERT(list_empty(&cell->servers)); ASSERT(list_empty(&cell->vl_list)); write_unlock(&afs_cells_lock); wake_up(&afs_cells_freeable_wq); _leave(" [unused]"); } /* * destroy a cell record * - must be called with the afs_cells_sem write-locked * - cell->link should have been broken by the caller */ static void afs_cell_destroy(struct afs_cell *cell) { _enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name); ASSERTCMP(atomic_read(&cell->usage), >=, 0); ASSERT(list_empty(&cell->link)); /* wait for everyone to stop using the cell */ if (atomic_read(&cell->usage) > 0) { DECLARE_WAITQUEUE(myself, current); _debug("wait for cell %s", cell->name); set_current_state(TASK_UNINTERRUPTIBLE); add_wait_queue(&afs_cells_freeable_wq, &myself); while (atomic_read(&cell->usage) > 0) { schedule(); set_current_state(TASK_UNINTERRUPTIBLE); } remove_wait_queue(&afs_cells_freeable_wq, &myself); set_current_state(TASK_RUNNING); } _debug("cell dead"); ASSERTCMP(atomic_read(&cell->usage), ==, 0); ASSERT(list_empty(&cell->servers)); ASSERT(list_empty(&cell->vl_list)); afs_proc_cell_remove(cell); down_write(&afs_proc_cells_sem); list_del_init(&cell->proc_link); up_write(&afs_proc_cells_sem); #ifdef CONFIG_AFS_FSCACHE fscache_relinquish_cookie(cell->cache, 0); #endif key_put(cell->anonymous_key); kfree(cell); _leave(" [destroyed]"); } /* * purge in-memory cell database on module unload or afs_init() failure * - the timeout daemon is stopped before calling this */ void afs_cell_purge(void) { struct afs_cell *cell; _enter(""); afs_put_cell(afs_cell_root); down_write(&afs_cells_sem); while (!list_empty(&afs_cells)) { cell = NULL; /* remove the next cell from the front of the list */ write_lock(&afs_cells_lock); if (!list_empty(&afs_cells)) { cell = list_entry(afs_cells.next, struct afs_cell, link); list_del_init(&cell->link); } write_unlock(&afs_cells_lock); if (cell) { _debug("PURGING CELL %s (%d)", cell->name, atomic_read(&cell->usage)); /* now the cell should be left with no references */ afs_cell_destroy(cell); } } up_write(&afs_cells_sem); _leave(""); }
208 208 208 74 74 74 179 180 2 2 2 2 38 38 64 61 64 12 64 11 64 15 2550 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 /* * Cryptographic API for algorithms (i.e., low-level API). * * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #include <linux/err.h> #include <linux/errno.h> #include <linux/fips.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <linux/string.h> #include "internal.h" static LIST_HEAD(crypto_template_list); static inline int crypto_set_driver_name(struct crypto_alg *alg) { static const char suffix[] = "-generic"; char *driver_name = alg->cra_driver_name; int len; if (*driver_name) return 0; len = strlcpy(driver_name, alg->cra_name, CRYPTO_MAX_ALG_NAME); if (len + sizeof(suffix) > CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; memcpy(driver_name + len, suffix, sizeof(suffix)); return 0; } static inline void crypto_check_module_sig(struct module *mod) { if (fips_enabled && mod && !module_sig_ok(mod)) panic("Module %s signature verification failed in FIPS mode\n", module_name(mod)); } static int crypto_check_alg(struct crypto_alg *alg) { crypto_check_module_sig(alg->cra_module); if (alg->cra_alignmask & (alg->cra_alignmask + 1)) return -EINVAL; if (alg->cra_blocksize > PAGE_SIZE / 8) return -EINVAL; if (alg->cra_priority < 0) return -EINVAL; atomic_set(&alg->cra_refcnt, 1); return crypto_set_driver_name(alg); } static void crypto_free_instance(struct crypto_instance *inst) { if (!inst->alg.cra_type->free) { inst->tmpl->free(inst); return; } inst->alg.cra_type->free(inst); } static void crypto_destroy_instance(struct crypto_alg *alg) { struct crypto_instance *inst = (void *)alg; struct crypto_template *tmpl = inst->tmpl; crypto_free_instance(inst); crypto_tmpl_put(tmpl); } static struct list_head *crypto_more_spawns(struct crypto_alg *alg, struct list_head *stack, struct list_head *top, struct list_head *secondary_spawns) { struct crypto_spawn *spawn, *n; spawn = list_first_entry_or_null(stack, struct crypto_spawn, list); if (!spawn) return NULL; n = list_next_entry(spawn, list); if (spawn->alg && &n->list != stack && !n->alg) n->alg = (n->list.next == stack) ? alg : &list_next_entry(n, list)->inst->alg; list_move(&spawn->list, secondary_spawns); return &n->list == stack ? top : &n->inst->alg.cra_users; } static void crypto_remove_instance(struct crypto_instance *inst, struct list_head *list) { struct crypto_template *tmpl = inst->tmpl; if (crypto_is_dead(&inst->alg)) return; inst->alg.cra_flags |= CRYPTO_ALG_DEAD; if (hlist_unhashed(&inst->list)) return; if (!tmpl || !crypto_tmpl_get(tmpl)) return; crypto_notify(CRYPTO_MSG_ALG_UNREGISTER, &inst->alg); list_move(&inst->alg.cra_list, list); hlist_del(&inst->list); inst->alg.cra_destroy = crypto_destroy_instance; BUG_ON(!list_empty(&inst->alg.cra_users)); } void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, struct crypto_alg *nalg) { u32 new_type = (nalg ?: alg)->cra_flags; struct crypto_spawn *spawn, *n; LIST_HEAD(secondary_spawns); struct list_head *spawns; LIST_HEAD(stack); LIST_HEAD(top); spawns = &alg->cra_users; list_for_each_entry_safe(spawn, n, spawns, list) { if ((spawn->alg->cra_flags ^ new_type) & spawn->mask) continue; list_move(&spawn->list, &top); } spawns = &top; do { while (!list_empty(spawns)) { struct crypto_instance *inst; spawn = list_first_entry(spawns, struct crypto_spawn, list); inst = spawn->inst; BUG_ON(&inst->alg == alg); list_move(&spawn->list, &stack); if (&inst->alg == nalg) break; spawn->alg = NULL; spawns = &inst->alg.cra_users; /* * We may encounter an unregistered instance here, since * an instance's spawns are set up prior to the instance * being registered. An unregistered instance will have * NULL ->cra_users.next, since ->cra_users isn't * properly initialized until registration. But an * unregistered instance cannot have any users, so treat * it the same as ->cra_users being empty. */ if (spawns->next == NULL) break; } } while ((spawns = crypto_more_spawns(alg, &stack, &top, &secondary_spawns))); list_for_each_entry_safe(spawn, n, &secondary_spawns, list) { if (spawn->alg) list_move(&spawn->list, &spawn->alg->cra_users); else crypto_remove_instance(spawn->inst, list); } } EXPORT_SYMBOL_GPL(crypto_remove_spawns); static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) { struct crypto_alg *q; struct crypto_larval *larval; int ret = -EAGAIN; if (crypto_is_dead(alg)) goto err; INIT_LIST_HEAD(&alg->cra_users); /* No cheating! */ alg->cra_flags &= ~CRYPTO_ALG_TESTED; ret = -EEXIST; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (q == alg) goto err; if (crypto_is_moribund(q)) continue; if (crypto_is_larval(q)) { if (!strcmp(alg->cra_driver_name, q->cra_driver_name)) goto err; continue; } if (!strcmp(q->cra_driver_name, alg->cra_name) || !strcmp(q->cra_name, alg->cra_driver_name)) goto err; } larval = crypto_larval_alloc(alg->cra_name, alg->cra_flags | CRYPTO_ALG_TESTED, 0); if (IS_ERR(larval)) goto out; ret = -ENOENT; larval->adult = crypto_mod_get(alg); if (!larval->adult) goto free_larval; atomic_set(&larval->alg.cra_refcnt, 1); memcpy(larval->alg.cra_driver_name, alg->cra_driver_name, CRYPTO_MAX_ALG_NAME); larval->alg.cra_priority = alg->cra_priority; list_add(&alg->cra_list, &crypto_alg_list); list_add(&larval->alg.cra_list, &crypto_alg_list); out: return larval; free_larval: kfree(larval); err: larval = ERR_PTR(ret); goto out; } void crypto_alg_tested(const char *name, int err) { struct crypto_larval *test; struct crypto_alg *alg; struct crypto_alg *q; LIST_HEAD(list); down_write(&crypto_alg_sem); list_for_each_entry(q, &crypto_alg_list, cra_list) { if (crypto_is_moribund(q) || !crypto_is_larval(q)) continue; test = (struct crypto_larval *)q; if (!strcmp(q->cra_driver_name, name)) goto found; } pr_err("alg: Unexpected test result for %s: %d\n", name, err); goto unlock; found: q->cra_flags |= CRYPTO_ALG_DEAD; alg = test->adult; if (err || list_empty(&alg->cra_list)) goto complete; alg->cra_flags |= CRYPTO_ALG_TESTED; list_for_each_entry(q, &crypto_alg_list, cra_list) { if (q == alg) continue; if (crypto_is_moribund(q)) continue; if (crypto_is_larval(q)) { struct crypto_larval *larval = (void *)q; /* * Check to see if either our generic name or * specific name can satisfy the name requested * by the larval entry q. */ if (strcmp(alg->cra_name, q->cra_name) && strcmp(alg->cra_driver_name, q->cra_name)) continue; if (larval->adult) continue; if ((q->cra_flags ^ alg->cra_flags) & larval->mask) continue; if (!crypto_mod_get(alg)) continue; larval->adult = alg; continue; } if (strcmp(alg->cra_name, q->cra_name)) continue; if (strcmp(alg->cra_driver_name, q->cra_driver_name) && q->cra_priority > alg->cra_priority) continue; crypto_remove_spawns(q, &list, alg); } complete: complete_all(&test->completion); unlock: up_write(&crypto_alg_sem); crypto_remove_final(&list); } EXPORT_SYMBOL_GPL(crypto_alg_tested); void crypto_remove_final(struct list_head *list) { struct crypto_alg *alg; struct crypto_alg *n; list_for_each_entry_safe(alg, n, list, cra_list) { list_del_init(&alg->cra_list); crypto_alg_put(alg); } } EXPORT_SYMBOL_GPL(crypto_remove_final); static void crypto_wait_for_test(struct crypto_larval *larval) { int err; err = crypto_probing_notify(CRYPTO_MSG_ALG_REGISTER, larval->adult); if (err != NOTIFY_STOP) { if (WARN_ON(err != NOTIFY_DONE)) goto out; crypto_alg_tested(larval->alg.cra_driver_name, 0); } err = wait_for_completion_killable(&larval->completion); WARN_ON(err); out: crypto_larval_kill(&larval->alg); } int crypto_register_alg(struct crypto_alg *alg) { struct crypto_larval *larval; int err; alg->cra_flags &= ~CRYPTO_ALG_DEAD; err = crypto_check_alg(alg); if (err) return err; down_write(&crypto_alg_sem); larval = __crypto_register_alg(alg); up_write(&crypto_alg_sem); if (IS_ERR(larval)) return PTR_ERR(larval); crypto_wait_for_test(larval); return 0; } EXPORT_SYMBOL_GPL(crypto_register_alg); static int crypto_remove_alg(struct crypto_alg *alg, struct list_head *list) { if (unlikely(list_empty(&alg->cra_list))) return -ENOENT; alg->cra_flags |= CRYPTO_ALG_DEAD; crypto_notify(CRYPTO_MSG_ALG_UNREGISTER, alg); list_del_init(&alg->cra_list); crypto_remove_spawns(alg, list, NULL); return 0; } int crypto_unregister_alg(struct crypto_alg *alg) { int ret; LIST_HEAD(list); down_write(&crypto_alg_sem); ret = crypto_remove_alg(alg, &list); up_write(&crypto_alg_sem); if (ret) return ret; BUG_ON(atomic_read(&alg->cra_refcnt) != 1); if (alg->cra_destroy) alg->cra_destroy(alg); crypto_remove_final(&list); return 0; } EXPORT_SYMBOL_GPL(crypto_unregister_alg); int crypto_register_algs(struct crypto_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_alg(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_alg(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_algs); int crypto_unregister_algs(struct crypto_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_unregister_alg(&algs[i]); if (ret) pr_err("Failed to unregister %s %s: %d\n", algs[i].cra_driver_name, algs[i].cra_name, ret); } return 0; } EXPORT_SYMBOL_GPL(crypto_unregister_algs); int crypto_register_template(struct crypto_template *tmpl) { struct crypto_template *q; int err = -EEXIST; down_write(&crypto_alg_sem); crypto_check_module_sig(tmpl->module); list_for_each_entry(q, &crypto_template_list, list) { if (q == tmpl) goto out; } list_add(&tmpl->list, &crypto_template_list); crypto_notify(CRYPTO_MSG_TMPL_REGISTER, tmpl); err = 0; out: up_write(&crypto_alg_sem); return err; } EXPORT_SYMBOL_GPL(crypto_register_template); void crypto_unregister_template(struct crypto_template *tmpl) { struct crypto_instance *inst; struct hlist_node *n; struct hlist_head *list; LIST_HEAD(users); down_write(&crypto_alg_sem); BUG_ON(list_empty(&tmpl->list)); list_del_init(&tmpl->list); list = &tmpl->instances; hlist_for_each_entry(inst, list, list) { int err = crypto_remove_alg(&inst->alg, &users); BUG_ON(err); } crypto_notify(CRYPTO_MSG_TMPL_UNREGISTER, tmpl); up_write(&crypto_alg_sem); hlist_for_each_entry_safe(inst, n, list, list) { BUG_ON(atomic_read(&inst->alg.cra_refcnt) != 1); crypto_free_instance(inst); } crypto_remove_final(&users); } EXPORT_SYMBOL_GPL(crypto_unregister_template); static struct crypto_template *__crypto_lookup_template(const char *name) { struct crypto_template *q, *tmpl = NULL; down_read(&crypto_alg_sem); list_for_each_entry(q, &crypto_template_list, list) { if (strcmp(q->name, name)) continue; if (unlikely(!crypto_tmpl_get(q))) continue; tmpl = q; break; } up_read(&crypto_alg_sem); return tmpl; } struct crypto_template *crypto_lookup_template(const char *name) { return try_then_request_module(__crypto_lookup_template(name), "crypto-%s", name); } EXPORT_SYMBOL_GPL(crypto_lookup_template); int crypto_register_instance(struct crypto_template *tmpl, struct crypto_instance *inst) { struct crypto_larval *larval; int err; err = crypto_check_alg(&inst->alg); if (err) return err; inst->alg.cra_module = tmpl->module; inst->alg.cra_flags |= CRYPTO_ALG_INSTANCE; if (unlikely(!crypto_mod_get(&inst->alg))) return -EAGAIN; down_write(&crypto_alg_sem); larval = __crypto_register_alg(&inst->alg); if (IS_ERR(larval)) goto unlock; hlist_add_head(&inst->list, &tmpl->instances); inst->tmpl = tmpl; unlock: up_write(&crypto_alg_sem); err = PTR_ERR(larval); if (IS_ERR(larval)) goto err; crypto_wait_for_test(larval); /* Remove instance if test failed */ if (!(inst->alg.cra_flags & CRYPTO_ALG_TESTED)) crypto_unregister_instance(inst); err = 0; err: crypto_mod_put(&inst->alg); return err; } EXPORT_SYMBOL_GPL(crypto_register_instance); int crypto_unregister_instance(struct crypto_instance *inst) { LIST_HEAD(list); down_write(&crypto_alg_sem); crypto_remove_spawns(&inst->alg, &list, NULL); crypto_remove_instance(inst, &list); up_write(&crypto_alg_sem); crypto_remove_final(&list); return 0; } EXPORT_SYMBOL_GPL(crypto_unregister_instance); int crypto_init_spawn(struct crypto_spawn *spawn, struct crypto_alg *alg, struct crypto_instance *inst, u32 mask) { int err = -EAGAIN; spawn->inst = inst; spawn->mask = mask; down_write(&crypto_alg_sem); if (!crypto_is_moribund(alg)) { list_add(&spawn->list, &alg->cra_users); spawn->alg = alg; err = 0; } up_write(&crypto_alg_sem); return err; } EXPORT_SYMBOL_GPL(crypto_init_spawn); int crypto_init_spawn2(struct crypto_spawn *spawn, struct crypto_alg *alg, struct crypto_instance *inst, const struct crypto_type *frontend) { int err = -EINVAL; if ((alg->cra_flags ^ frontend->type) & frontend->maskset) goto out; spawn->frontend = frontend; err = crypto_init_spawn(spawn, alg, inst, frontend->maskset); out: return err; } EXPORT_SYMBOL_GPL(crypto_init_spawn2); int crypto_grab_spawn(struct crypto_spawn *spawn, const char *name, u32 type, u32 mask) { struct crypto_alg *alg; int err; alg = crypto_find_alg(name, spawn->frontend, type, mask); if (IS_ERR(alg)) return PTR_ERR(alg); err = crypto_init_spawn(spawn, alg, spawn->inst, mask); crypto_mod_put(alg); return err; } EXPORT_SYMBOL_GPL(crypto_grab_spawn); void crypto_drop_spawn(struct crypto_spawn *spawn) { down_write(&crypto_alg_sem); if (spawn->alg) list_del(&spawn->list); up_write(&crypto_alg_sem); } EXPORT_SYMBOL_GPL(crypto_drop_spawn); static struct crypto_alg *crypto_spawn_alg(struct crypto_spawn *spawn) { struct crypto_alg *alg; down_read(&crypto_alg_sem); alg = spawn->alg; if (alg && !crypto_mod_get(alg)) { alg->cra_flags |= CRYPTO_ALG_DYING; alg = NULL; } up_read(&crypto_alg_sem); return alg ?: ERR_PTR(-EAGAIN); } struct crypto_tfm *crypto_spawn_tfm(struct crypto_spawn *spawn, u32 type, u32 mask) { struct crypto_alg *alg; struct crypto_tfm *tfm; alg = crypto_spawn_alg(spawn); if (IS_ERR(alg)) return ERR_CAST(alg); tfm = ERR_PTR(-EINVAL); if (unlikely((alg->cra_flags ^ type) & mask)) goto out_put_alg; tfm = __crypto_alloc_tfm(alg, type, mask); if (IS_ERR(tfm)) goto out_put_alg; return tfm; out_put_alg: crypto_mod_put(alg); return tfm; } EXPORT_SYMBOL_GPL(crypto_spawn_tfm); void *crypto_spawn_tfm2(struct crypto_spawn *spawn) { struct crypto_alg *alg; struct crypto_tfm *tfm; alg = crypto_spawn_alg(spawn); if (IS_ERR(alg)) return ERR_CAST(alg); tfm = crypto_create_tfm(alg, spawn->frontend); if (IS_ERR(tfm)) goto out_put_alg; return tfm; out_put_alg: crypto_mod_put(alg); return tfm; } EXPORT_SYMBOL_GPL(crypto_spawn_tfm2); int crypto_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_register_notifier); int crypto_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&crypto_chain, nb); } EXPORT_SYMBOL_GPL(crypto_unregister_notifier); struct crypto_attr_type *crypto_get_attr_type(struct rtattr **tb) { struct rtattr *rta = tb[0]; struct crypto_attr_type *algt; if (!rta) return ERR_PTR(-ENOENT); if (RTA_PAYLOAD(rta) < sizeof(*algt)) return ERR_PTR(-EINVAL); if (rta->rta_type != CRYPTOA_TYPE) return ERR_PTR(-EINVAL); algt = RTA_DATA(rta); return algt; } EXPORT_SYMBOL_GPL(crypto_get_attr_type); int crypto_check_attr_type(struct rtattr **tb, u32 type) { struct crypto_attr_type *algt; algt = crypto_get_attr_type(tb); if (IS_ERR(algt)) return PTR_ERR(algt); if ((algt->type ^ type) & algt->mask) return -EINVAL; return 0; } EXPORT_SYMBOL_GPL(crypto_check_attr_type); const char *crypto_attr_alg_name(struct rtattr *rta) { struct crypto_attr_alg *alga; if (!rta) return ERR_PTR(-ENOENT); if (RTA_PAYLOAD(rta) < sizeof(*alga)) return ERR_PTR(-EINVAL); if (rta->rta_type != CRYPTOA_ALG) return ERR_PTR(-EINVAL); alga = RTA_DATA(rta); alga->name[CRYPTO_MAX_ALG_NAME - 1] = 0; return alga->name; } EXPORT_SYMBOL_GPL(crypto_attr_alg_name); struct crypto_alg *crypto_attr_alg2(struct rtattr *rta, const struct crypto_type *frontend, u32 type, u32 mask) { const char *name; name = crypto_attr_alg_name(rta); if (IS_ERR(name)) return ERR_CAST(name); return crypto_find_alg(name, frontend, type, mask); } EXPORT_SYMBOL_GPL(crypto_attr_alg2); int crypto_attr_u32(struct rtattr *rta, u32 *num) { struct crypto_attr_u32 *nu32; if (!rta) return -ENOENT; if (RTA_PAYLOAD(rta) < sizeof(*nu32)) return -EINVAL; if (rta->rta_type != CRYPTOA_U32) return -EINVAL; nu32 = RTA_DATA(rta); *num = nu32->num; return 0; } EXPORT_SYMBOL_GPL(crypto_attr_u32); int crypto_inst_setname(struct crypto_instance *inst, const char *name, struct crypto_alg *alg) { if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", name, alg->cra_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", name, alg->cra_driver_name) >= CRYPTO_MAX_ALG_NAME) return -ENAMETOOLONG; return 0; } EXPORT_SYMBOL_GPL(crypto_inst_setname); void *crypto_alloc_instance2(const char *name, struct crypto_alg *alg, unsigned int head) { struct crypto_instance *inst; char *p; int err; p = kzalloc(head + sizeof(*inst) + sizeof(struct crypto_spawn), GFP_KERNEL); if (!p) return ERR_PTR(-ENOMEM); inst = (void *)(p + head); err = crypto_inst_setname(inst, name, alg); if (err) goto err_free_inst; return p; err_free_inst: kfree(p); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_alloc_instance2); struct crypto_instance *crypto_alloc_instance(const char *name, struct crypto_alg *alg) { struct crypto_instance *inst; struct crypto_spawn *spawn; int err; inst = crypto_alloc_instance2(name, alg, 0); if (IS_ERR(inst)) goto out; spawn = crypto_instance_ctx(inst); err = crypto_init_spawn(spawn, alg, inst, CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_ASYNC); if (err) goto err_free_inst; return inst; err_free_inst: kfree(inst); inst = ERR_PTR(err); out: return inst; } EXPORT_SYMBOL_GPL(crypto_alloc_instance); void crypto_init_queue(struct crypto_queue *queue, unsigned int max_qlen) { INIT_LIST_HEAD(&queue->list); queue->backlog = &queue->list; queue->qlen = 0; queue->max_qlen = max_qlen; } EXPORT_SYMBOL_GPL(crypto_init_queue); int crypto_enqueue_request(struct crypto_queue *queue, struct crypto_async_request *request) { int err = -EINPROGRESS; if (unlikely(queue->qlen >= queue->max_qlen)) { err = -EBUSY; if (!(request->flags & CRYPTO_TFM_REQ_MAY_BACKLOG)) goto out; if (queue->backlog == &queue->list) queue->backlog = &request->list; } queue->qlen++; list_add_tail(&request->list, &queue->list); out: return err; } EXPORT_SYMBOL_GPL(crypto_enqueue_request); struct crypto_async_request *crypto_dequeue_request(struct crypto_queue *queue) { struct list_head *request; if (unlikely(!queue->qlen)) return NULL; queue->qlen--; if (queue->backlog != &queue->list) queue->backlog = queue->backlog->next; request = queue->list.next; list_del(request); return list_entry(request, struct crypto_async_request, list); } EXPORT_SYMBOL_GPL(crypto_dequeue_request); int crypto_tfm_in_queue(struct crypto_queue *queue, struct crypto_tfm *tfm) { struct crypto_async_request *req; list_for_each_entry(req, &queue->list, list) { if (req->tfm == tfm) return 1; } return 0; } EXPORT_SYMBOL_GPL(crypto_tfm_in_queue); static inline void crypto_inc_byte(u8 *a, unsigned int size) { u8 *b = (a + size); u8 c; for (; size; size--) { c = *--b + 1; *b = c; if (c) break; } } void crypto_inc(u8 *a, unsigned int size) { __be32 *b = (__be32 *)(a + size); u32 c; if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || IS_ALIGNED((unsigned long)b, __alignof__(*b))) for (; size >= 4; size -= 4) { c = be32_to_cpu(*--b) + 1; *b = cpu_to_be32(c); if (likely(c)) return; } crypto_inc_byte(a, size); } EXPORT_SYMBOL_GPL(crypto_inc); void __crypto_xor(u8 *dst, const u8 *src1, const u8 *src2, unsigned int len) { int relalign = 0; if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { int size = sizeof(unsigned long); int d = (((unsigned long)dst ^ (unsigned long)src1) | ((unsigned long)dst ^ (unsigned long)src2)) & (size - 1); relalign = d ? 1 << __ffs(d) : size; /* * If we care about alignment, process as many bytes as * needed to advance dst and src to values whose alignments * equal their relative alignment. This will allow us to * process the remainder of the input using optimal strides. */ while (((unsigned long)dst & (relalign - 1)) && len > 0) { *dst++ = *src1++ ^ *src2++; len--; } } while (IS_ENABLED(CONFIG_64BIT) && len >= 8 && !(relalign & 7)) { *(u64 *)dst = *(u64 *)src1 ^ *(u64 *)src2; dst += 8; src1 += 8; src2 += 8; len -= 8; } while (len >= 4 && !(relalign & 3)) { *(u32 *)dst = *(u32 *)src1 ^ *(u32 *)src2; dst += 4; src1 += 4; src2 += 4; len -= 4; } while (len >= 2 && !(relalign & 1)) { *(u16 *)dst = *(u16 *)src1 ^ *(u16 *)src2; dst += 2; src1 += 2; src2 += 2; len -= 2; } while (len--) *dst++ = *src1++ ^ *src2++; } EXPORT_SYMBOL_GPL(__crypto_xor); unsigned int crypto_alg_extsize(struct crypto_alg *alg) { return alg->cra_ctxsize + (alg->cra_alignmask & ~(crypto_tfm_ctx_alignment() - 1)); } EXPORT_SYMBOL_GPL(crypto_alg_extsize); int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, u32 type, u32 mask) { int ret = 0; struct crypto_alg *alg = crypto_find_alg(name, frontend, type, mask); if (!IS_ERR(alg)) { crypto_mod_put(alg); ret = 1; } return ret; } EXPORT_SYMBOL_GPL(crypto_type_has_alg); static int __init crypto_algapi_init(void) { crypto_init_proc(); return 0; } static void __exit crypto_algapi_exit(void) { crypto_exit_proc(); } module_init(crypto_algapi_init); module_exit(crypto_algapi_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Cryptographic algorithms API");
66 66 178 178 66 65 66 66 66 178 178 178 66 35 35 35 35 35 35 4500 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 /* * Universal/legacy driver for 8250/16550-type serial ports * * Based on drivers/char/serial.c, by Linus Torvalds, Theodore Ts'o. * * Copyright (C) 2001 Russell King. * * Supports: ISA-compatible 8250/16550 ports * PNP 8250/16550 ports * early_serial_setup() ports * userspace-configurable "phantom" ports * "serial8250" platform devices * serial8250_register_8250_port() ports * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/console.h> #include <linux/sysrq.h> #include <linux/delay.h> #include <linux/platform_device.h> #include <linux/tty.h> #include <linux/ratelimit.h> #include <linux/tty_flip.h> #include <linux/serial.h> #include <linux/serial_8250.h> #include <linux/nmi.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> #include <linux/io.h> #ifdef CONFIG_SPARC #include <linux/sunserialcore.h> #endif #include <asm/irq.h> #include "8250.h" /* * Configuration: * share_irqs - whether we pass IRQF_SHARED to request_irq(). This option * is unsafe when used on edge-triggered interrupts. */ static unsigned int share_irqs = SERIAL8250_SHARE_IRQS; static unsigned int nr_uarts = CONFIG_SERIAL_8250_RUNTIME_UARTS; static struct uart_driver serial8250_reg; static unsigned int skip_txen_test; /* force skip of txen test at init time */ #define PASS_LIMIT 512 #include <asm/serial.h> /* * SERIAL_PORT_DFNS tells us about built-in ports that have no * standard enumeration mechanism. Platforms that can find all * serial ports via mechanisms like ACPI or PCI need not supply it. */ #ifndef SERIAL_PORT_DFNS #define SERIAL_PORT_DFNS #endif static const struct old_serial_port old_serial_port[] = { SERIAL_PORT_DFNS /* defined in asm/serial.h */ }; #define UART_NR CONFIG_SERIAL_8250_NR_UARTS #ifdef CONFIG_SERIAL_8250_RSA #define PORT_RSA_MAX 4 static unsigned long probe_rsa[PORT_RSA_MAX]; static unsigned int probe_rsa_count; #endif /* CONFIG_SERIAL_8250_RSA */ struct irq_info { struct hlist_node node; int irq; spinlock_t lock; /* Protects list not the hash */ struct list_head *head; }; #define NR_IRQ_HASH 32 /* Can be adjusted later */ static struct hlist_head irq_lists[NR_IRQ_HASH]; static DEFINE_MUTEX(hash_mutex); /* Used to walk the hash */ /* * This is the serial driver's interrupt routine. * * Arjan thinks the old way was overly complex, so it got simplified. * Alan disagrees, saying that need the complexity to handle the weird * nature of ISA shared interrupts. (This is a special exception.) * * In order to handle ISA shared interrupts properly, we need to check * that all ports have been serviced, and therefore the ISA interrupt * line has been de-asserted. * * This means we need to loop through all ports. checking that they * don't have an interrupt pending. */ static irqreturn_t serial8250_interrupt(int irq, void *dev_id) { struct irq_info *i = dev_id; struct list_head *l, *end = NULL; int pass_counter = 0, handled = 0; pr_debug("%s(%d): start\n", __func__, irq); spin_lock(&i->lock); l = i->head; do { struct uart_8250_port *up; struct uart_port *port; up = list_entry(l, struct uart_8250_port, list); port = &up->port; if (port->handle_irq(port)) { handled = 1; end = NULL; } else if (end == NULL) end = l; l = l->next; if (l == i->head && pass_counter++ > PASS_LIMIT) { /* If we hit this, we're dead. */ printk_ratelimited(KERN_ERR "serial8250: too much work for irq%d\n", irq); break; } } while (l != end); spin_unlock(&i->lock); pr_debug("%s(%d): end\n", __func__, irq); return IRQ_RETVAL(handled); } /* * To support ISA shared interrupts, we need to have one interrupt * handler that ensures that the IRQ line has been deasserted * before returning. Failing to do this will result in the IRQ * line being stuck active, and, since ISA irqs are edge triggered, * no more IRQs will be seen. */ static void serial_do_unlink(struct irq_info *i, struct uart_8250_port *up) { spin_lock_irq(&i->lock); if (!list_empty(i->head)) { if (i->head == &up->list) i->head = i->head->next; list_del(&up->list); } else { BUG_ON(i->head != &up->list); i->head = NULL; } spin_unlock_irq(&i->lock); /* List empty so throw away the hash node */ if (i->head == NULL) { hlist_del(&i->node); kfree(i); } } static int serial_link_irq_chain(struct uart_8250_port *up) { struct hlist_head *h; struct hlist_node *n; struct irq_info *i; int ret; mutex_lock(&hash_mutex); h = &irq_lists[up->port.irq % NR_IRQ_HASH]; hlist_for_each(n, h) { i = hlist_entry(n, struct irq_info, node); if (i->irq == up->port.irq) break; } if (n == NULL) { i = kzalloc(sizeof(struct irq_info), GFP_KERNEL); if (i == NULL) { mutex_unlock(&hash_mutex); return -ENOMEM; } spin_lock_init(&i->lock); i->irq = up->port.irq; hlist_add_head(&i->node, h); } mutex_unlock(&hash_mutex); spin_lock_irq(&i->lock); if (i->head) { list_add(&up->list, i->head); spin_unlock_irq(&i->lock); ret = 0; } else { INIT_LIST_HEAD(&up->list); i->head = &up->list; spin_unlock_irq(&i->lock); ret = request_irq(up->port.irq, serial8250_interrupt, up->port.irqflags, up->port.name, i); if (ret < 0) serial_do_unlink(i, up); } return ret; } static void serial_unlink_irq_chain(struct uart_8250_port *up) { /* * yes, some broken gcc emit "warning: 'i' may be used uninitialized" * but no, we are not going to take a patch that assigns NULL below. */ struct irq_info *i; struct hlist_node *n; struct hlist_head *h; mutex_lock(&hash_mutex); h = &irq_lists[up->port.irq % NR_IRQ_HASH]; hlist_for_each(n, h) { i = hlist_entry(n, struct irq_info, node); if (i->irq == up->port.irq) break; } BUG_ON(n == NULL); BUG_ON(i->head == NULL); if (list_empty(i->head)) free_irq(up->port.irq, i); serial_do_unlink(i, up); mutex_unlock(&hash_mutex); } /* * This function is used to handle ports that do not have an * interrupt. This doesn't work very well for 16450's, but gives * barely passable results for a 16550A. (Although at the expense * of much CPU overhead). */ static void serial8250_timeout(unsigned long data) { struct uart_8250_port *up = (struct uart_8250_port *)data; up->port.handle_irq(&up->port); mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port)); } static void serial8250_backup_timeout(unsigned long data) { struct uart_8250_port *up = (struct uart_8250_port *)data; unsigned int iir, ier = 0, lsr; unsigned long flags; spin_lock_irqsave(&up->port.lock, flags); /* * Must disable interrupts or else we risk racing with the interrupt * based handler. */ if (up->port.irq) { ier = serial_in(up, UART_IER); serial_out(up, UART_IER, 0); } iir = serial_in(up, UART_IIR); /* * This should be a safe test for anyone who doesn't trust the * IIR bits on their UART, but it's specifically designed for * the "Diva" UART used on the management processor on many HP * ia64 and parisc boxes. */ lsr = serial_in(up, UART_LSR); up->lsr_saved_flags |= lsr & LSR_SAVE_FLAGS; if ((iir & UART_IIR_NO_INT) && (up->ier & UART_IER_THRI) && (!uart_circ_empty(&up->port.state->xmit) || up->port.x_char) && (lsr & UART_LSR_THRE)) { iir &= ~(UART_IIR_ID | UART_IIR_NO_INT); iir |= UART_IIR_THRI; } if (!(iir & UART_IIR_NO_INT)) serial8250_tx_chars(up); if (up->port.irq) serial_out(up, UART_IER, ier); spin_unlock_irqrestore(&up->port.lock, flags); /* Standard timer interval plus 0.2s to keep the port running */ mod_timer(&up->timer, jiffies + uart_poll_timeout(&up->port) + HZ / 5); } static int univ8250_setup_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; int retval = 0; /* * The above check will only give an accurate result the first time * the port is opened so this value needs to be preserved. */ if (up->bugs & UART_BUG_THRE) { pr_debug("ttyS%d - using backup timer\n", serial_index(port)); up->timer.function = serial8250_backup_timeout; up->timer.data = (unsigned long)up; mod_timer(&up->timer, jiffies + uart_poll_timeout(port) + HZ / 5); } /* * If the "interrupt" for this port doesn't correspond with any * hardware interrupt, we use a timer-based system. The original * driver used to do this with IRQ0. */ if (!port->irq) { up->timer.data = (unsigned long)up; mod_timer(&up->timer, jiffies + uart_poll_timeout(port)); } else retval = serial_link_irq_chain(up); return retval; } static void univ8250_release_irq(struct uart_8250_port *up) { struct uart_port *port = &up->port; del_timer_sync(&up->timer); up->timer.function = serial8250_timeout; if (port->irq) serial_unlink_irq_chain(up); } #ifdef CONFIG_SERIAL_8250_RSA static int serial8250_request_rsa_resource(struct uart_8250_port *up) { unsigned long start = UART_RSA_BASE << up->port.regshift; unsigned int size = 8 << up->port.regshift; struct uart_port *port = &up->port; int ret = -EINVAL; switch (port->iotype) { case UPIO_HUB6: case UPIO_PORT: start += port->iobase; if (request_region(start, size, "serial-rsa")) ret = 0; else ret = -EBUSY; break; } return ret; } static void serial8250_release_rsa_resource(struct uart_8250_port *up) { unsigned long offset = UART_RSA_BASE << up->port.regshift; unsigned int size = 8 << up->port.regshift; struct uart_port *port = &up->port; switch (port->iotype) { case UPIO_HUB6: case UPIO_PORT: release_region(port->iobase + offset, size); break; } } #endif static const struct uart_ops *base_ops; static struct uart_ops univ8250_port_ops; static const struct uart_8250_ops univ8250_driver_ops = { .setup_irq = univ8250_setup_irq, .release_irq = univ8250_release_irq, }; static struct uart_8250_port serial8250_ports[UART_NR]; /** * serial8250_get_port - retrieve struct uart_8250_port * @line: serial line number * * This function retrieves struct uart_8250_port for the specific line. * This struct *must* *not* be used to perform a 8250 or serial core operation * which is not accessible otherwise. Its only purpose is to make the struct * accessible to the runtime-pm callbacks for context suspend/restore. * The lock assumption made here is none because runtime-pm suspend/resume * callbacks should not be invoked if there is any operation performed on the * port. */ struct uart_8250_port *serial8250_get_port(int line) { return &serial8250_ports[line]; } EXPORT_SYMBOL_GPL(serial8250_get_port); static void (*serial8250_isa_config)(int port, struct uart_port *up, u32 *capabilities); void serial8250_set_isa_configurator( void (*v)(int port, struct uart_port *up, u32 *capabilities)) { serial8250_isa_config = v; } EXPORT_SYMBOL(serial8250_set_isa_configurator); #ifdef CONFIG_SERIAL_8250_RSA static void univ8250_config_port(struct uart_port *port, int flags) { struct uart_8250_port *up = up_to_u8250p(port); up->probe &= ~UART_PROBE_RSA; if (port->type == PORT_RSA) { if (serial8250_request_rsa_resource(up) == 0) up->probe |= UART_PROBE_RSA; } else if (flags & UART_CONFIG_TYPE) { int i; for (i = 0; i < probe_rsa_count; i++) { if (probe_rsa[i] == up->port.iobase) { if (serial8250_request_rsa_resource(up) == 0) up->probe |= UART_PROBE_RSA; break; } } } base_ops->config_port(port, flags); if (port->type != PORT_RSA && up->probe & UART_PROBE_RSA) serial8250_release_rsa_resource(up); } static int univ8250_request_port(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); int ret; ret = base_ops->request_port(port); if (ret == 0 && port->type == PORT_RSA) { ret = serial8250_request_rsa_resource(up); if (ret < 0) base_ops->release_port(port); } return ret; } static void univ8250_release_port(struct uart_port *port) { struct uart_8250_port *up = up_to_u8250p(port); if (port->type == PORT_RSA) serial8250_release_rsa_resource(up); base_ops->release_port(port); } static void univ8250_rsa_support(struct uart_ops *ops) { ops->config_port = univ8250_config_port; ops->request_port = univ8250_request_port; ops->release_port = univ8250_release_port; } #else #define univ8250_rsa_support(x) do { } while (0) #endif /* CONFIG_SERIAL_8250_RSA */ static inline void serial8250_apply_quirks(struct uart_8250_port *up) { up->port.quirks |= skip_txen_test ? UPQ_NO_TXEN_TEST : 0; } static void __init serial8250_isa_init_ports(void) { struct uart_8250_port *up; static int first = 1; int i, irqflag = 0; if (!first) return; first = 0; if (nr_uarts > UART_NR) nr_uarts = UART_NR; for (i = 0; i < nr_uarts; i++) { struct uart_8250_port *up = &serial8250_ports[i]; struct uart_port *port = &up->port; port->line = i; serial8250_init_port(up); if (!base_ops) base_ops = port->ops; port->ops = &univ8250_port_ops; init_timer(&up->timer); up->timer.function = serial8250_timeout; up->ops = &univ8250_driver_ops; /* * ALPHA_KLUDGE_MCR needs to be killed. */ up->mcr_mask = ~ALPHA_KLUDGE_MCR; up->mcr_force = ALPHA_KLUDGE_MCR; serial8250_set_defaults(up); } /* chain base port ops to support Remote Supervisor Adapter */ univ8250_port_ops = *base_ops; univ8250_rsa_support(&univ8250_port_ops); if (share_irqs) irqflag = IRQF_SHARED; for (i = 0, up = serial8250_ports; i < ARRAY_SIZE(old_serial_port) && i < nr_uarts; i++, up++) { struct uart_port *port = &up->port; port->iobase = old_serial_port[i].port; port->irq = irq_canonicalize(old_serial_port[i].irq); port->irqflags = 0; port->uartclk = old_serial_port[i].baud_base * 16; port->flags = old_serial_port[i].flags; port->hub6 = 0; port->membase = old_serial_port[i].iomem_base; port->iotype = old_serial_port[i].io_type; port->regshift = old_serial_port[i].iomem_reg_shift; port->irqflags |= irqflag; if (serial8250_isa_config != NULL) serial8250_isa_config(i, &up->port, &up->capabilities); } } static void __init serial8250_register_ports(struct uart_driver *drv, struct device *dev) { int i; for (i = 0; i < nr_uarts; i++) { struct uart_8250_port *up = &serial8250_ports[i]; if (up->port.type == PORT_8250_CIR) continue; if (up->port.dev) continue; up->port.dev = dev; serial8250_apply_quirks(up); uart_add_one_port(drv, &up->port); } } #ifdef CONFIG_SERIAL_8250_CONSOLE static void univ8250_console_write(struct console *co, const char *s, unsigned int count) { struct uart_8250_port *up = &serial8250_ports[co->index]; serial8250_console_write(up, s, count); } static int univ8250_console_setup(struct console *co, char *options) { struct uart_port *port; int retval; /* * Check whether an invalid uart number has been specified, and * if so, search for the first available port that does have * console support. */ if (co->index >= nr_uarts) co->index = 0; port = &serial8250_ports[co->index].port; /* link port to console */ port->cons = co; retval = serial8250_console_setup(port, options, false); if (retval != 0) port->cons = NULL; return retval; } /** * univ8250_console_match - non-standard console matching * @co: registering console * @name: name from console command line * @idx: index from console command line * @options: ptr to option string from console command line * * Only attempts to match console command lines of the form: * console=uart[8250],io|mmio|mmio16|mmio32,<addr>[,<options>] * console=uart[8250],0x<addr>[,<options>] * This form is used to register an initial earlycon boot console and * replace it with the serial8250_console at 8250 driver init. * * Performs console setup for a match (as required by interface) * If no <options> are specified, then assume the h/w is already setup. * * Returns 0 if console matches; otherwise non-zero to use default matching */ static int univ8250_console_match(struct console *co, char *name, int idx, char *options) { char match[] = "uart"; /* 8250-specific earlycon name */ unsigned char iotype; resource_size_t addr; int i; if (strncmp(name, match, 4) != 0) return -ENODEV; if (uart_parse_earlycon(options, &iotype, &addr, &options)) return -ENODEV; /* try to match the port specified on the command line */ for (i = 0; i < nr_uarts; i++) { struct uart_port *port = &serial8250_ports[i].port; if (port->iotype != iotype) continue; if ((iotype == UPIO_MEM || iotype == UPIO_MEM16 || iotype == UPIO_MEM32 || iotype == UPIO_MEM32BE) && (port->mapbase != addr)) continue; if (iotype == UPIO_PORT && port->iobase != addr) continue; co->index = i; port->cons = co; return serial8250_console_setup(port, options, true); } return -ENODEV; } static struct console univ8250_console = { .name = "ttyS", .write = univ8250_console_write, .device = uart_console_device, .setup = univ8250_console_setup, .match = univ8250_console_match, .flags = CON_PRINTBUFFER | CON_ANYTIME, .index = -1, .data = &serial8250_reg, }; static int __init univ8250_console_init(void) { if (nr_uarts == 0) return -ENODEV; serial8250_isa_init_ports(); register_console(&univ8250_console); return 0; } console_initcall(univ8250_console_init); #define SERIAL8250_CONSOLE (&univ8250_console) #else #define SERIAL8250_CONSOLE NULL #endif static struct uart_driver serial8250_reg = { .owner = THIS_MODULE, .driver_name = "serial", .dev_name = "ttyS", .major = TTY_MAJOR, .minor = 64, .cons = SERIAL8250_CONSOLE, }; /* * early_serial_setup - early registration for 8250 ports * * Setup an 8250 port structure prior to console initialisation. Use * after console initialisation will cause undefined behaviour. */ int __init early_serial_setup(struct uart_port *port) { struct uart_port *p; if (port->line >= ARRAY_SIZE(serial8250_ports) || nr_uarts == 0) return -ENODEV; serial8250_isa_init_ports(); p = &serial8250_ports[port->line].port; p->iobase = port->iobase; p->membase = port->membase; p->irq = port->irq; p->irqflags = port->irqflags; p->uartclk = port->uartclk; p->fifosize = port->fifosize; p->regshift = port->regshift; p->iotype = port->iotype; p->flags = port->flags; p->mapbase = port->mapbase; p->mapsize = port->mapsize; p->private_data = port->private_data; p->type = port->type; p->line = port->line; serial8250_set_defaults(up_to_u8250p(p)); if (port->serial_in) p->serial_in = port->serial_in; if (port->serial_out) p->serial_out = port->serial_out; if (port->handle_irq) p->handle_irq = port->handle_irq; return 0; } /** * serial8250_suspend_port - suspend one serial port * @line: serial line number * * Suspend one serial port. */ void serial8250_suspend_port(int line) { struct uart_8250_port *up = &serial8250_ports[line]; struct uart_port *port = &up->port; if (!console_suspend_enabled && uart_console(port) && port->type != PORT_8250) { unsigned char canary = 0xa5; serial_out(up, UART_SCR, canary); if (serial_in(up, UART_SCR) == canary) up->canary = canary; } uart_suspend_port(&serial8250_reg, port); } EXPORT_SYMBOL(serial8250_suspend_port); /** * serial8250_resume_port - resume one serial port * @line: serial line number * * Resume one serial port. */ void serial8250_resume_port(int line) { struct uart_8250_port *up = &serial8250_ports[line]; struct uart_port *port = &up->port; up->canary = 0; if (up->capabilities & UART_NATSEMI) { /* Ensure it's still in high speed mode */ serial_port_out(port, UART_LCR, 0xE0); ns16550a_goto_highspeed(up); serial_port_out(port, UART_LCR, 0); port->uartclk = 921600*16; } uart_resume_port(&serial8250_reg, port); } EXPORT_SYMBOL(serial8250_resume_port); /* * Register a set of serial devices attached to a platform device. The * list is terminated with a zero flags entry, which means we expect * all entries to have at least UPF_BOOT_AUTOCONF set. */ static int serial8250_probe(struct platform_device *dev) { struct plat_serial8250_port *p = dev_get_platdata(&dev->dev); struct uart_8250_port uart; int ret, i, irqflag = 0; memset(&uart, 0, sizeof(uart)); if (share_irqs) irqflag = IRQF_SHARED; for (i = 0; p && p->flags != 0; p++, i++) { uart.port.iobase = p->iobase; uart.port.membase = p->membase; uart.port.irq = p->irq; uart.port.irqflags = p->irqflags; uart.port.uartclk = p->uartclk; uart.port.regshift = p->regshift; uart.port.iotype = p->iotype; uart.port.flags = p->flags; uart.port.mapbase = p->mapbase; uart.port.hub6 = p->hub6; uart.port.private_data = p->private_data; uart.port.type = p->type; uart.port.serial_in = p->serial_in; uart.port.serial_out = p->serial_out; uart.port.handle_irq = p->handle_irq; uart.port.handle_break = p->handle_break; uart.port.set_termios = p->set_termios; uart.port.set_ldisc = p->set_ldisc; uart.port.get_mctrl = p->get_mctrl; uart.port.pm = p->pm; uart.port.dev = &dev->dev; uart.port.irqflags |= irqflag; ret = serial8250_register_8250_port(&uart); if (ret < 0) { dev_err(&dev->dev, "unable to register port at index %d " "(IO%lx MEM%llx IRQ%d): %d\n", i, p->iobase, (unsigned long long)p->mapbase, p->irq, ret); } } return 0; } /* * Remove serial ports registered against a platform device. */ static int serial8250_remove(struct platform_device *dev) { int i; for (i = 0; i < nr_uarts; i++) { struct uart_8250_port *up = &serial8250_ports[i]; if (up->port.dev == &dev->dev) serial8250_unregister_port(i); } return 0; } static int serial8250_suspend(struct platform_device *dev, pm_message_t state) { int i; for (i = 0; i < UART_NR; i++) { struct uart_8250_port *up = &serial8250_ports[i]; if (up->port.type != PORT_UNKNOWN && up->port.dev == &dev->dev) uart_suspend_port(&serial8250_reg, &up->port); } return 0; } static int serial8250_resume(struct platform_device *dev) { int i; for (i = 0; i < UART_NR; i++) { struct uart_8250_port *up = &serial8250_ports[i]; if (up->port.type != PORT_UNKNOWN && up->port.dev == &dev->dev) serial8250_resume_port(i); } return 0; } static struct platform_driver serial8250_isa_driver = { .probe = serial8250_probe, .remove = serial8250_remove, .suspend = serial8250_suspend, .resume = serial8250_resume, .driver = { .name = "serial8250", }, }; /* * This "device" covers _all_ ISA 8250-compatible serial devices listed * in the table in include/asm/serial.h */ static struct platform_device *serial8250_isa_devs; /* * serial8250_register_8250_port and serial8250_unregister_port allows for * 16x50 serial ports to be configured at run-time, to support PCMCIA * modems and PCI multiport cards. */ static DEFINE_MUTEX(serial_mutex); static struct uart_8250_port *serial8250_find_match_or_unused(struct uart_port *port) { int i; /* * First, find a port entry which matches. */ for (i = 0; i < nr_uarts; i++) if (uart_match_port(&serial8250_ports[i].port, port)) return &serial8250_ports[i]; /* try line number first if still available */ i = port->line; if (i < nr_uarts && serial8250_ports[i].port.type == PORT_UNKNOWN && serial8250_ports[i].port.iobase == 0) return &serial8250_ports[i]; /* * We didn't find a matching entry, so look for the first * free entry. We look for one which hasn't been previously * used (indicated by zero iobase). */ for (i = 0; i < nr_uarts; i++) if (serial8250_ports[i].port.type == PORT_UNKNOWN && serial8250_ports[i].port.iobase == 0) return &serial8250_ports[i]; /* * That also failed. Last resort is to find any entry which * doesn't have a real port associated with it. */ for (i = 0; i < nr_uarts; i++) if (serial8250_ports[i].port.type == PORT_UNKNOWN) return &serial8250_ports[i]; return NULL; } static void serial_8250_overrun_backoff_work(struct work_struct *work) { struct uart_8250_port *up = container_of(to_delayed_work(work), struct uart_8250_port, overrun_backoff); struct uart_port *port = &up->port; unsigned long flags; spin_lock_irqsave(&port->lock, flags); up->ier |= UART_IER_RLSI | UART_IER_RDI; up->port.read_status_mask |= UART_LSR_DR; serial_out(up, UART_IER, up->ier); spin_unlock_irqrestore(&port->lock, flags); } /** * serial8250_register_8250_port - register a serial port * @up: serial port template * * Configure the serial port specified by the request. If the * port exists and is in use, it is hung up and unregistered * first. * * The port is then probed and if necessary the IRQ is autodetected * If this fails an error is returned. * * On success the port is ready to use and the line number is returned. */ int serial8250_register_8250_port(struct uart_8250_port *up) { struct uart_8250_port *uart; int ret = -ENOSPC; if (up->port.uartclk == 0) return -EINVAL; mutex_lock(&serial_mutex); uart = serial8250_find_match_or_unused(&up->port); if (uart && uart->port.type != PORT_8250_CIR) { if (uart->port.dev) uart_remove_one_port(&serial8250_reg, &uart->port); uart->port.iobase = up->port.iobase; uart->port.membase = up->port.membase; uart->port.irq = up->port.irq; uart->port.irqflags = up->port.irqflags; uart->port.uartclk = up->port.uartclk; uart->port.fifosize = up->port.fifosize; uart->port.regshift = up->port.regshift; uart->port.iotype = up->port.iotype; uart->port.flags = up->port.flags | UPF_BOOT_AUTOCONF; uart->bugs = up->bugs; uart->port.mapbase = up->port.mapbase; uart->port.mapsize = up->port.mapsize; uart->port.private_data = up->port.private_data; uart->tx_loadsz = up->tx_loadsz; uart->capabilities = up->capabilities; uart->port.throttle = up->port.throttle; uart->port.unthrottle = up->port.unthrottle; uart->port.rs485_config = up->port.rs485_config; uart->port.rs485 = up->port.rs485; uart->dma = up->dma; /* Take tx_loadsz from fifosize if it wasn't set separately */ if (uart->port.fifosize && !uart->tx_loadsz) uart->tx_loadsz = uart->port.fifosize; if (up->port.dev) uart->port.dev = up->port.dev; if (up->port.flags & UPF_FIXED_TYPE) uart->port.type = up->port.type; serial8250_set_defaults(uart); /* Possibly override default I/O functions. */ if (up->port.serial_in) uart->port.serial_in = up->port.serial_in; if (up->port.serial_out) uart->port.serial_out = up->port.serial_out; if (up->port.handle_irq) uart->port.handle_irq = up->port.handle_irq; /* Possibly override set_termios call */ if (up->port.set_termios) uart->port.set_termios = up->port.set_termios; if (up->port.set_ldisc) uart->port.set_ldisc = up->port.set_ldisc; if (up->port.get_mctrl) uart->port.get_mctrl = up->port.get_mctrl; if (up->port.set_mctrl) uart->port.set_mctrl = up->port.set_mctrl; if (up->port.startup) uart->port.startup = up->port.startup; if (up->port.shutdown) uart->port.shutdown = up->port.shutdown; if (up->port.pm) uart->port.pm = up->port.pm; if (up->port.handle_break) uart->port.handle_break = up->port.handle_break; if (up->dl_read) uart->dl_read = up->dl_read; if (up->dl_write) uart->dl_write = up->dl_write; if (uart->port.type != PORT_8250_CIR) { if (serial8250_isa_config != NULL) serial8250_isa_config(0, &uart->port, &uart->capabilities); serial8250_apply_quirks(uart); ret = uart_add_one_port(&serial8250_reg, &uart->port); if (ret) goto err; ret = uart->port.line; } else { dev_info(uart->port.dev, "skipping CIR port at 0x%lx / 0x%llx, IRQ %d\n", uart->port.iobase, (unsigned long long)uart->port.mapbase, uart->port.irq); ret = 0; } /* Initialise interrupt backoff work if required */ if (up->overrun_backoff_time_ms > 0) { uart->overrun_backoff_time_ms = up->overrun_backoff_time_ms; INIT_DELAYED_WORK(&uart->overrun_backoff, serial_8250_overrun_backoff_work); } else { uart->overrun_backoff_time_ms = 0; } } mutex_unlock(&serial_mutex); return ret; err: uart->port.dev = NULL; mutex_unlock(&serial_mutex); return ret; } EXPORT_SYMBOL(serial8250_register_8250_port); /** * serial8250_unregister_port - remove a 16x50 serial port at runtime * @line: serial line number * * Remove one serial port. This may not be called from interrupt * context. We hand the port back to the our control. */ void serial8250_unregister_port(int line) { struct uart_8250_port *uart = &serial8250_ports[line]; mutex_lock(&serial_mutex); if (uart->em485) { unsigned long flags; spin_lock_irqsave(&uart->port.lock, flags); serial8250_em485_destroy(uart); spin_unlock_irqrestore(&uart->port.lock, flags); } uart_remove_one_port(&serial8250_reg, &uart->port); if (serial8250_isa_devs) { uart->port.flags &= ~UPF_BOOT_AUTOCONF; uart->port.type = PORT_UNKNOWN; uart->port.dev = &serial8250_isa_devs->dev; uart->capabilities = 0; serial8250_apply_quirks(uart); uart_add_one_port(&serial8250_reg, &uart->port); } else { uart->port.dev = NULL; } mutex_unlock(&serial_mutex); } EXPORT_SYMBOL(serial8250_unregister_port); static int __init serial8250_init(void) { int ret; if (nr_uarts == 0) return -ENODEV; serial8250_isa_init_ports(); pr_info("Serial: 8250/16550 driver, %d ports, IRQ sharing %sabled\n", nr_uarts, share_irqs ? "en" : "dis"); #ifdef CONFIG_SPARC ret = sunserial_register_minors(&serial8250_reg, UART_NR); #else serial8250_reg.nr = UART_NR; ret = uart_register_driver(&serial8250_reg); #endif if (ret) goto out; ret = serial8250_pnp_init(); if (ret) goto unreg_uart_drv; serial8250_isa_devs = platform_device_alloc("serial8250", PLAT8250_DEV_LEGACY); if (!serial8250_isa_devs) { ret = -ENOMEM; goto unreg_pnp; } ret = platform_device_add(serial8250_isa_devs); if (ret) goto put_dev; serial8250_register_ports(&serial8250_reg, &serial8250_isa_devs->dev); ret = platform_driver_register(&serial8250_isa_driver); if (ret == 0) goto out; platform_device_del(serial8250_isa_devs); put_dev: platform_device_put(serial8250_isa_devs); unreg_pnp: serial8250_pnp_exit(); unreg_uart_drv: #ifdef CONFIG_SPARC sunserial_unregister_minors(&serial8250_reg, UART_NR); #else uart_unregister_driver(&serial8250_reg); #endif out: return ret; } static void __exit serial8250_exit(void) { struct platform_device *isa_dev = serial8250_isa_devs; /* * This tells serial8250_unregister_port() not to re-register * the ports (thereby making serial8250_isa_driver permanently * in use.) */ serial8250_isa_devs = NULL; platform_driver_unregister(&serial8250_isa_driver); platform_device_unregister(isa_dev); serial8250_pnp_exit(); #ifdef CONFIG_SPARC sunserial_unregister_minors(&serial8250_reg, UART_NR); #else uart_unregister_driver(&serial8250_reg); #endif } module_init(serial8250_init); module_exit(serial8250_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Generic 8250/16x50 serial driver"); module_param_hw(share_irqs, uint, other, 0644); MODULE_PARM_DESC(share_irqs, "Share IRQs with other non-8250/16x50 devices (unsafe)"); module_param(nr_uarts, uint, 0644); MODULE_PARM_DESC(nr_uarts, "Maximum number of UARTs supported. (1-" __MODULE_STRING(CONFIG_SERIAL_8250_NR_UARTS) ")"); module_param(skip_txen_test, uint, 0644); MODULE_PARM_DESC(skip_txen_test, "Skip checking for the TXEN bug at init time"); #ifdef CONFIG_SERIAL_8250_RSA module_param_hw_array(probe_rsa, ulong, ioport, &probe_rsa_count, 0444); MODULE_PARM_DESC(probe_rsa, "Probe I/O ports for RSA"); #endif MODULE_ALIAS_CHARDEV_MAJOR(TTY_MAJOR); #ifdef CONFIG_SERIAL_8250_DEPRECATED_OPTIONS #ifndef MODULE /* This module was renamed to 8250_core in 3.7. Keep the old "8250" name * working as well for the module options so we don't break people. We * need to keep the names identical and the convenient macros will happily * refuse to let us do that by failing the build with redefinition errors * of global variables. So we stick them inside a dummy function to avoid * those conflicts. The options still get parsed, and the redefined * MODULE_PARAM_PREFIX lets us keep the "8250." syntax alive. * * This is hacky. I'm sorry. */ static void __used s8250_options(void) { #undef MODULE_PARAM_PREFIX #define MODULE_PARAM_PREFIX "8250_core." module_param_cb(share_irqs, &param_ops_uint, &share_irqs, 0644); module_param_cb(nr_uarts, &param_ops_uint, &nr_uarts, 0644); module_param_cb(skip_txen_test, &param_ops_uint, &skip_txen_test, 0644); #ifdef CONFIG_SERIAL_8250_RSA __module_param_call(MODULE_PARAM_PREFIX, probe_rsa, &param_array_ops, .arr = &__param_arr_probe_rsa, 0444, -1, 0); #endif } #else MODULE_ALIAS("8250_core"); #endif #endif
29 49 211 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ #ifndef __KVM_IODEV_H__ #define __KVM_IODEV_H__ #include <linux/kvm_types.h> #include <linux/errno.h> struct kvm_io_device; struct kvm_vcpu; /** * kvm_io_device_ops are called under kvm slots_lock. * read and write handlers return 0 if the transaction has been handled, * or non-zero to have it passed to the next device. **/ struct kvm_io_device_ops { int (*read)(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, void *val); int (*write)(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val); void (*destructor)(struct kvm_io_device *this); }; struct kvm_io_device { const struct kvm_io_device_ops *ops; }; static inline void kvm_iodevice_init(struct kvm_io_device *dev, const struct kvm_io_device_ops *ops) { dev->ops = ops; } static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, void *v) { return dev->ops->read ? dev->ops->read(vcpu, dev, addr, l, v) : -EOPNOTSUPP; } static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, gpa_t addr, int l, const void *v) { return dev->ops->write ? dev->ops->write(vcpu, dev, addr, l, v) : -EOPNOTSUPP; } static inline void kvm_iodevice_destructor(struct kvm_io_device *dev) { if (dev->ops->destructor) dev->ops->destructor(dev); } #endif /* __KVM_IODEV_H__ */
1 1 1 11 15 11 15 1 1 1 13 13 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/rfkill.h> #include <linux/nfc.h> #include <net/genetlink.h> #include "nfc.h" #define VERSION "0.1" #define NFC_CHECK_PRES_FREQ_MS 2000 int nfc_devlist_generation; DEFINE_MUTEX(nfc_devlist_mutex); /* NFC device ID bitmap */ static DEFINE_IDA(nfc_index_ida); int nfc_fw_download(struct nfc_dev *dev, const char *firmware_name) { int rc = 0; pr_debug("%s do firmware %s\n", dev_name(&dev->dev), firmware_name); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->dev_up) { rc = -EBUSY; goto error; } if (!dev->ops->fw_download) { rc = -EOPNOTSUPP; goto error; } dev->fw_download_in_progress = true; rc = dev->ops->fw_download(dev, firmware_name); if (rc) dev->fw_download_in_progress = false; error: device_unlock(&dev->dev); return rc; } /** * nfc_fw_download_done - inform that a firmware download was completed * * @dev: The nfc device to which firmware was downloaded * @firmware_name: The firmware filename * @result: The positive value of a standard errno value */ int nfc_fw_download_done(struct nfc_dev *dev, const char *firmware_name, u32 result) { dev->fw_download_in_progress = false; return nfc_genl_fw_download_done(dev, firmware_name, result); } EXPORT_SYMBOL(nfc_fw_download_done); /** * nfc_dev_up - turn on the NFC device * * @dev: The nfc device to be turned on * * The device remains up until the nfc_dev_down function is called. */ int nfc_dev_up(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->rfkill && rfkill_blocked(dev->rfkill)) { rc = -ERFKILL; goto error; } if (dev->fw_download_in_progress) { rc = -EBUSY; goto error; } if (dev->dev_up) { rc = -EALREADY; goto error; } if (dev->ops->dev_up) rc = dev->ops->dev_up(dev); if (!rc) dev->dev_up = true; /* We have to enable the device before discovering SEs */ if (dev->ops->discover_se && dev->ops->discover_se(dev)) pr_err("SE discovery failed\n"); error: device_unlock(&dev->dev); return rc; } /** * nfc_dev_down - turn off the NFC device * * @dev: The nfc device to be turned off */ int nfc_dev_down(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -EALREADY; goto error; } if (dev->polling || dev->active_target) { rc = -EBUSY; goto error; } if (dev->ops->dev_down) dev->ops->dev_down(dev); dev->dev_up = false; error: device_unlock(&dev->dev); return rc; } static int nfc_rfkill_set_block(void *data, bool blocked) { struct nfc_dev *dev = data; pr_debug("%s blocked %d", dev_name(&dev->dev), blocked); if (!blocked) return 0; nfc_dev_down(dev); return 0; } static const struct rfkill_ops nfc_rfkill_ops = { .set_block = nfc_rfkill_set_block, }; /** * nfc_start_poll - start polling for nfc targets * * @dev: The nfc device that must start polling * @protocols: bitset of nfc protocols that must be used for polling * * The device remains polling for targets until a target is found or * the nfc_stop_poll function is called. */ int nfc_start_poll(struct nfc_dev *dev, u32 im_protocols, u32 tm_protocols) { int rc; pr_debug("dev_name %s initiator protocols 0x%x target protocols 0x%x\n", dev_name(&dev->dev), im_protocols, tm_protocols); if (!im_protocols && !tm_protocols) return -EINVAL; device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (dev->polling) { rc = -EBUSY; goto error; } rc = dev->ops->start_poll(dev, im_protocols, tm_protocols); if (!rc) { dev->polling = true; dev->rf_mode = NFC_RF_NONE; } error: device_unlock(&dev->dev); return rc; } /** * nfc_stop_poll - stop polling for nfc targets * * @dev: The nfc device that must stop polling */ int nfc_stop_poll(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->polling) { rc = -EINVAL; goto error; } dev->ops->stop_poll(dev); dev->polling = false; dev->rf_mode = NFC_RF_NONE; error: device_unlock(&dev->dev); return rc; } static struct nfc_target *nfc_find_target(struct nfc_dev *dev, u32 target_idx) { int i; for (i = 0; i < dev->n_targets; i++) { if (dev->targets[i].idx == target_idx) return &dev->targets[i]; } return NULL; } int nfc_dep_link_up(struct nfc_dev *dev, int target_index, u8 comm_mode) { int rc = 0; u8 *gb; size_t gb_len; struct nfc_target *target; pr_debug("dev_name=%s comm %d\n", dev_name(&dev->dev), comm_mode); if (!dev->ops->dep_link_up) return -EOPNOTSUPP; device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->dep_link_up == true) { rc = -EALREADY; goto error; } gb = nfc_llcp_general_bytes(dev, &gb_len); if (gb_len > NFC_MAX_GT_LEN) { rc = -EINVAL; goto error; } target = nfc_find_target(dev, target_index); if (target == NULL) { rc = -ENOTCONN; goto error; } rc = dev->ops->dep_link_up(dev, target, comm_mode, gb, gb_len); if (!rc) { dev->active_target = target; dev->rf_mode = NFC_RF_INITIATOR; } error: device_unlock(&dev->dev); return rc; } int nfc_dep_link_down(struct nfc_dev *dev) { int rc = 0; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); if (!dev->ops->dep_link_down) return -EOPNOTSUPP; device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->dep_link_up == false) { rc = -EALREADY; goto error; } rc = dev->ops->dep_link_down(dev); if (!rc) { dev->dep_link_up = false; dev->active_target = NULL; dev->rf_mode = NFC_RF_NONE; nfc_llcp_mac_is_down(dev); nfc_genl_dep_link_down_event(dev); } error: device_unlock(&dev->dev); return rc; } int nfc_dep_link_is_up(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { dev->dep_link_up = true; if (!dev->active_target && rf_mode == NFC_RF_INITIATOR) { struct nfc_target *target; target = nfc_find_target(dev, target_idx); if (target == NULL) return -ENOTCONN; dev->active_target = target; } dev->polling = false; dev->rf_mode = rf_mode; nfc_llcp_mac_is_up(dev, target_idx, comm_mode, rf_mode); return nfc_genl_dep_link_up_event(dev, target_idx, comm_mode, rf_mode); } EXPORT_SYMBOL(nfc_dep_link_is_up); /** * nfc_activate_target - prepare the target for data exchange * * @dev: The nfc device that found the target * @target_idx: index of the target that must be activated * @protocol: nfc protocol that will be used for data exchange */ int nfc_activate_target(struct nfc_dev *dev, u32 target_idx, u32 protocol) { int rc; struct nfc_target *target; pr_debug("dev_name=%s target_idx=%u protocol=%u\n", dev_name(&dev->dev), target_idx, protocol); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->active_target) { rc = -EBUSY; goto error; } target = nfc_find_target(dev, target_idx); if (target == NULL) { rc = -ENOTCONN; goto error; } rc = dev->ops->activate_target(dev, target, protocol); if (!rc) { dev->active_target = target; dev->rf_mode = NFC_RF_INITIATOR; if (dev->ops->check_presence && !dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } error: device_unlock(&dev->dev); return rc; } /** * nfc_deactivate_target - deactivate a nfc target * * @dev: The nfc device that found the target * @target_idx: index of the target that must be deactivated */ int nfc_deactivate_target(struct nfc_dev *dev, u32 target_idx, u8 mode) { int rc = 0; pr_debug("dev_name=%s target_idx=%u\n", dev_name(&dev->dev), target_idx); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (dev->active_target == NULL) { rc = -ENOTCONN; goto error; } if (dev->active_target->idx != target_idx) { rc = -ENOTCONN; goto error; } if (dev->ops->check_presence) del_timer_sync(&dev->check_pres_timer); dev->ops->deactivate_target(dev, dev->active_target, mode); dev->active_target = NULL; error: device_unlock(&dev->dev); return rc; } /** * nfc_data_exchange - transceive data * * @dev: The nfc device that found the target * @target_idx: index of the target * @skb: data to be sent * @cb: callback called when the response is received * @cb_context: parameter for the callback function * * The user must wait for the callback before calling this function again. */ int nfc_data_exchange(struct nfc_dev *dev, u32 target_idx, struct sk_buff *skb, data_exchange_cb_t cb, void *cb_context) { int rc; pr_debug("dev_name=%s target_idx=%u skb->len=%u\n", dev_name(&dev->dev), target_idx, skb->len); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; kfree_skb(skb); goto error; } if (dev->rf_mode == NFC_RF_INITIATOR && dev->active_target != NULL) { if (dev->active_target->idx != target_idx) { rc = -EADDRNOTAVAIL; kfree_skb(skb); goto error; } if (dev->ops->check_presence) del_timer_sync(&dev->check_pres_timer); rc = dev->ops->im_transceive(dev, dev->active_target, skb, cb, cb_context); if (!rc && dev->ops->check_presence && !dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } else if (dev->rf_mode == NFC_RF_TARGET && dev->ops->tm_send != NULL) { rc = dev->ops->tm_send(dev, skb); } else { rc = -ENOTCONN; kfree_skb(skb); goto error; } error: device_unlock(&dev->dev); return rc; } struct nfc_se *nfc_find_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se; list_for_each_entry(se, &dev->secure_elements, list) if (se->idx == se_idx) return se; return NULL; } EXPORT_SYMBOL(nfc_find_se); int nfc_enable_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (dev->polling) { rc = -EBUSY; goto error; } if (!dev->ops->enable_se || !dev->ops->disable_se) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state == NFC_SE_ENABLED) { rc = -EALREADY; goto error; } rc = dev->ops->enable_se(dev, se_idx); if (rc >= 0) se->state = NFC_SE_ENABLED; error: device_unlock(&dev->dev); return rc; } int nfc_disable_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (dev->shutting_down) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (!dev->ops->enable_se || !dev->ops->disable_se) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state == NFC_SE_DISABLED) { rc = -EALREADY; goto error; } rc = dev->ops->disable_se(dev, se_idx); if (rc >= 0) se->state = NFC_SE_DISABLED; error: device_unlock(&dev->dev); return rc; } int nfc_set_remote_general_bytes(struct nfc_dev *dev, u8 *gb, u8 gb_len) { pr_debug("dev_name=%s gb_len=%d\n", dev_name(&dev->dev), gb_len); return nfc_llcp_set_remote_gb(dev, gb, gb_len); } EXPORT_SYMBOL(nfc_set_remote_general_bytes); u8 *nfc_get_local_general_bytes(struct nfc_dev *dev, size_t *gb_len) { pr_debug("dev_name=%s\n", dev_name(&dev->dev)); return nfc_llcp_general_bytes(dev, gb_len); } EXPORT_SYMBOL(nfc_get_local_general_bytes); int nfc_tm_data_received(struct nfc_dev *dev, struct sk_buff *skb) { /* Only LLCP target mode for now */ if (dev->dep_link_up == false) { kfree_skb(skb); return -ENOLINK; } return nfc_llcp_data_received(dev, skb); } EXPORT_SYMBOL(nfc_tm_data_received); int nfc_tm_activated(struct nfc_dev *dev, u32 protocol, u8 comm_mode, u8 *gb, size_t gb_len) { int rc; device_lock(&dev->dev); dev->polling = false; if (gb != NULL) { rc = nfc_set_remote_general_bytes(dev, gb, gb_len); if (rc < 0) goto out; } dev->rf_mode = NFC_RF_TARGET; if (protocol == NFC_PROTO_NFC_DEP_MASK) nfc_dep_link_is_up(dev, 0, comm_mode, NFC_RF_TARGET); rc = nfc_genl_tm_activated(dev, protocol); out: device_unlock(&dev->dev); return rc; } EXPORT_SYMBOL(nfc_tm_activated); int nfc_tm_deactivated(struct nfc_dev *dev) { dev->dep_link_up = false; dev->rf_mode = NFC_RF_NONE; return nfc_genl_tm_deactivated(dev); } EXPORT_SYMBOL(nfc_tm_deactivated); /** * nfc_alloc_send_skb - allocate a skb for data exchange responses * * @size: size to allocate * @gfp: gfp flags */ struct sk_buff *nfc_alloc_send_skb(struct nfc_dev *dev, struct sock *sk, unsigned int flags, unsigned int size, unsigned int *err) { struct sk_buff *skb; unsigned int total_size; total_size = size + dev->tx_headroom + dev->tx_tailroom + NFC_HEADER_SIZE; skb = sock_alloc_send_skb(sk, total_size, flags & MSG_DONTWAIT, err); if (skb) skb_reserve(skb, dev->tx_headroom + NFC_HEADER_SIZE); return skb; } /** * nfc_alloc_recv_skb - allocate a skb for data exchange responses * * @size: size to allocate * @gfp: gfp flags */ struct sk_buff *nfc_alloc_recv_skb(unsigned int size, gfp_t gfp) { struct sk_buff *skb; unsigned int total_size; total_size = size + 1; skb = alloc_skb(total_size, gfp); if (skb) skb_reserve(skb, 1); return skb; } EXPORT_SYMBOL(nfc_alloc_recv_skb); /** * nfc_targets_found - inform that targets were found * * @dev: The nfc device that found the targets * @targets: array of nfc targets found * @ntargets: targets array size * * The device driver must call this function when one or many nfc targets * are found. After calling this function, the device driver must stop * polling for targets. * NOTE: This function can be called with targets=NULL and n_targets=0 to * notify a driver error, meaning that the polling operation cannot complete. * IMPORTANT: this function must not be called from an atomic context. * In addition, it must also not be called from a context that would prevent * the NFC Core to call other nfc ops entry point concurrently. */ int nfc_targets_found(struct nfc_dev *dev, struct nfc_target *targets, int n_targets) { int i; pr_debug("dev_name=%s n_targets=%d\n", dev_name(&dev->dev), n_targets); for (i = 0; i < n_targets; i++) targets[i].idx = dev->target_next_idx++; device_lock(&dev->dev); if (dev->polling == false) { device_unlock(&dev->dev); return 0; } dev->polling = false; dev->targets_generation++; kfree(dev->targets); dev->targets = NULL; if (targets) { dev->targets = kmemdup(targets, n_targets * sizeof(struct nfc_target), GFP_ATOMIC); if (!dev->targets) { dev->n_targets = 0; device_unlock(&dev->dev); return -ENOMEM; } } dev->n_targets = n_targets; device_unlock(&dev->dev); nfc_genl_targets_found(dev); return 0; } EXPORT_SYMBOL(nfc_targets_found); /** * nfc_target_lost - inform that an activated target went out of field * * @dev: The nfc device that had the activated target in field * @target_idx: the nfc index of the target * * The device driver must call this function when the activated target * goes out of the field. * IMPORTANT: this function must not be called from an atomic context. * In addition, it must also not be called from a context that would prevent * the NFC Core to call other nfc ops entry point concurrently. */ int nfc_target_lost(struct nfc_dev *dev, u32 target_idx) { struct nfc_target *tg; int i; pr_debug("dev_name %s n_target %d\n", dev_name(&dev->dev), target_idx); device_lock(&dev->dev); for (i = 0; i < dev->n_targets; i++) { tg = &dev->targets[i]; if (tg->idx == target_idx) break; } if (i == dev->n_targets) { device_unlock(&dev->dev); return -EINVAL; } dev->targets_generation++; dev->n_targets--; dev->active_target = NULL; if (dev->n_targets) { memcpy(&dev->targets[i], &dev->targets[i + 1], (dev->n_targets - i) * sizeof(struct nfc_target)); } else { kfree(dev->targets); dev->targets = NULL; } device_unlock(&dev->dev); nfc_genl_target_lost(dev, target_idx); return 0; } EXPORT_SYMBOL(nfc_target_lost); inline void nfc_driver_failure(struct nfc_dev *dev, int err) { nfc_targets_found(dev, NULL, 0); } EXPORT_SYMBOL(nfc_driver_failure); int nfc_add_se(struct nfc_dev *dev, u32 se_idx, u16 type) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); se = nfc_find_se(dev, se_idx); if (se) return -EALREADY; se = kzalloc(sizeof(struct nfc_se), GFP_KERNEL); if (!se) return -ENOMEM; se->idx = se_idx; se->type = type; se->state = NFC_SE_DISABLED; INIT_LIST_HEAD(&se->list); list_add(&se->list, &dev->secure_elements); rc = nfc_genl_se_added(dev, se_idx, type); if (rc < 0) { list_del(&se->list); kfree(se); return rc; } return 0; } EXPORT_SYMBOL(nfc_add_se); int nfc_remove_se(struct nfc_dev *dev, u32 se_idx) { struct nfc_se *se, *n; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); list_for_each_entry_safe(se, n, &dev->secure_elements, list) if (se->idx == se_idx) { rc = nfc_genl_se_removed(dev, se_idx); if (rc < 0) return rc; list_del(&se->list); kfree(se); return 0; } return -EINVAL; } EXPORT_SYMBOL(nfc_remove_se); int nfc_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction) { int rc; pr_debug("transaction: %x\n", se_idx); device_lock(&dev->dev); if (!evt_transaction) { rc = -EPROTO; goto out; } rc = nfc_genl_se_transaction(dev, se_idx, evt_transaction); out: device_unlock(&dev->dev); return rc; } EXPORT_SYMBOL(nfc_se_transaction); int nfc_se_connectivity(struct nfc_dev *dev, u8 se_idx) { int rc; pr_debug("connectivity: %x\n", se_idx); device_lock(&dev->dev); rc = nfc_genl_se_connectivity(dev, se_idx); device_unlock(&dev->dev); return rc; } EXPORT_SYMBOL(nfc_se_connectivity); static void nfc_release(struct device *d) { struct nfc_dev *dev = to_nfc_dev(d); struct nfc_se *se, *n; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); nfc_genl_data_exit(&dev->genl_data); kfree(dev->targets); list_for_each_entry_safe(se, n, &dev->secure_elements, list) { nfc_genl_se_removed(dev, se->idx); list_del(&se->list); kfree(se); } ida_simple_remove(&nfc_index_ida, dev->idx); kfree(dev); } static void nfc_check_pres_work(struct work_struct *work) { struct nfc_dev *dev = container_of(work, struct nfc_dev, check_pres_work); int rc; device_lock(&dev->dev); if (dev->active_target && timer_pending(&dev->check_pres_timer) == 0) { rc = dev->ops->check_presence(dev, dev->active_target); if (rc == -EOPNOTSUPP) goto exit; if (rc) { u32 active_target_idx = dev->active_target->idx; device_unlock(&dev->dev); nfc_target_lost(dev, active_target_idx); return; } if (!dev->shutting_down) mod_timer(&dev->check_pres_timer, jiffies + msecs_to_jiffies(NFC_CHECK_PRES_FREQ_MS)); } exit: device_unlock(&dev->dev); } static void nfc_check_pres_timeout(unsigned long data) { struct nfc_dev *dev = (struct nfc_dev *)data; schedule_work(&dev->check_pres_work); } struct class nfc_class = { .name = "nfc", .dev_release = nfc_release, }; EXPORT_SYMBOL(nfc_class); static int match_idx(struct device *d, const void *data) { struct nfc_dev *dev = to_nfc_dev(d); const unsigned int *idx = data; return dev->idx == *idx; } struct nfc_dev *nfc_get_device(unsigned int idx) { struct device *d; d = class_find_device(&nfc_class, NULL, &idx, match_idx); if (!d) return NULL; return to_nfc_dev(d); } /** * nfc_allocate_device - allocate a new nfc device * * @ops: device operations * @supported_protocols: NFC protocols supported by the device */ struct nfc_dev *nfc_allocate_device(struct nfc_ops *ops, u32 supported_protocols, int tx_headroom, int tx_tailroom) { struct nfc_dev *dev; int rc; if (!ops->start_poll || !ops->stop_poll || !ops->activate_target || !ops->deactivate_target || !ops->im_transceive) return NULL; if (!supported_protocols) return NULL; dev = kzalloc(sizeof(struct nfc_dev), GFP_KERNEL); if (!dev) return NULL; rc = ida_simple_get(&nfc_index_ida, 0, 0, GFP_KERNEL); if (rc < 0) goto err_free_dev; dev->idx = rc; dev->dev.class = &nfc_class; dev_set_name(&dev->dev, "nfc%d", dev->idx); device_initialize(&dev->dev); dev->ops = ops; dev->supported_protocols = supported_protocols; dev->tx_headroom = tx_headroom; dev->tx_tailroom = tx_tailroom; INIT_LIST_HEAD(&dev->secure_elements); nfc_genl_data_init(&dev->genl_data); dev->rf_mode = NFC_RF_NONE; /* first generation must not be 0 */ dev->targets_generation = 1; if (ops->check_presence) { init_timer(&dev->check_pres_timer); dev->check_pres_timer.data = (unsigned long)dev; dev->check_pres_timer.function = nfc_check_pres_timeout; INIT_WORK(&dev->check_pres_work, nfc_check_pres_work); } return dev; err_free_dev: kfree(dev); return NULL; } EXPORT_SYMBOL(nfc_allocate_device); /** * nfc_register_device - register a nfc device in the nfc subsystem * * @dev: The nfc device to register */ int nfc_register_device(struct nfc_dev *dev) { int rc; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); mutex_lock(&nfc_devlist_mutex); nfc_devlist_generation++; rc = device_add(&dev->dev); mutex_unlock(&nfc_devlist_mutex); if (rc < 0) return rc; rc = nfc_llcp_register_device(dev); if (rc) pr_err("Could not register llcp device\n"); device_lock(&dev->dev); dev->rfkill = rfkill_alloc(dev_name(&dev->dev), &dev->dev, RFKILL_TYPE_NFC, &nfc_rfkill_ops, dev); if (dev->rfkill) { if (rfkill_register(dev->rfkill) < 0) { rfkill_destroy(dev->rfkill); dev->rfkill = NULL; } } dev->shutting_down = false; device_unlock(&dev->dev); rc = nfc_genl_device_added(dev); if (rc) pr_debug("The userspace won't be notified that the device %s was added\n", dev_name(&dev->dev)); return 0; } EXPORT_SYMBOL(nfc_register_device); /** * nfc_unregister_device - unregister a nfc device in the nfc subsystem * * @dev: The nfc device to unregister */ void nfc_unregister_device(struct nfc_dev *dev) { int rc; pr_debug("dev_name=%s\n", dev_name(&dev->dev)); rc = nfc_genl_device_removed(dev); if (rc) pr_debug("The userspace won't be notified that the device %s " "was removed\n", dev_name(&dev->dev)); device_lock(&dev->dev); if (dev->rfkill) { rfkill_unregister(dev->rfkill); rfkill_destroy(dev->rfkill); dev->rfkill = NULL; } dev->shutting_down = true; device_unlock(&dev->dev); if (dev->ops->check_presence) { del_timer_sync(&dev->check_pres_timer); cancel_work_sync(&dev->check_pres_work); } nfc_llcp_unregister_device(dev); mutex_lock(&nfc_devlist_mutex); nfc_devlist_generation++; device_del(&dev->dev); mutex_unlock(&nfc_devlist_mutex); } EXPORT_SYMBOL(nfc_unregister_device); static int __init nfc_init(void) { int rc; pr_info("NFC Core ver %s\n", VERSION); rc = class_register(&nfc_class); if (rc) return rc; rc = nfc_genl_init(); if (rc) goto err_genl; /* the first generation must not be 0 */ nfc_devlist_generation = 1; rc = rawsock_init(); if (rc) goto err_rawsock; rc = nfc_llcp_init(); if (rc) goto err_llcp_sock; rc = af_nfc_init(); if (rc) goto err_af_nfc; return 0; err_af_nfc: nfc_llcp_exit(); err_llcp_sock: rawsock_exit(); err_rawsock: nfc_genl_exit(); err_genl: class_unregister(&nfc_class); return rc; } static void __exit nfc_exit(void) { af_nfc_exit(); nfc_llcp_exit(); rawsock_exit(); nfc_genl_exit(); class_unregister(&nfc_class); } subsys_initcall(nfc_init); module_exit(nfc_exit); MODULE_AUTHOR("Lauro Ramos Venancio <lauro.venancio@openbossa.org>"); MODULE_DESCRIPTION("NFC Core ver " VERSION); MODULE_VERSION(VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_NFC); MODULE_ALIAS_GENL_FAMILY(NFC_GENL_NAME);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 /* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */ /* * aoenet.c * Ethernet portion of AoE driver */ #include <linux/gfp.h> #include <linux/hdreg.h> #include <linux/blkdev.h> #include <linux/netdevice.h> #include <linux/moduleparam.h> #include <net/net_namespace.h> #include <asm/unaligned.h> #include "aoe.h" #define NECODES 5 static char *aoe_errlist[] = { "no such error", "unrecognized command code", "bad argument parameter", "device unavailable", "config string present", "unsupported version" }; enum { IFLISTSZ = 1024, }; static char aoe_iflist[IFLISTSZ]; module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600); MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]"); static wait_queue_head_t txwq; static struct ktstate kts; #ifndef MODULE static int __init aoe_iflist_setup(char *str) { strncpy(aoe_iflist, str, IFLISTSZ); aoe_iflist[IFLISTSZ - 1] = '\0'; return 1; } __setup("aoe_iflist=", aoe_iflist_setup); #endif static spinlock_t txlock; static struct sk_buff_head skbtxq; /* enters with txlock held */ static int tx(int id) __must_hold(&txlock) { struct sk_buff *skb; struct net_device *ifp; while ((skb = skb_dequeue(&skbtxq))) { spin_unlock_irq(&txlock); ifp = skb->dev; if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit()) pr_warn("aoe: packet could not be sent on %s. %s\n", ifp ? ifp->name : "netif", "consider increasing tx_queue_len"); spin_lock_irq(&txlock); } return 0; } int is_aoe_netif(struct net_device *ifp) { register char *p, *q; register int len; if (aoe_iflist[0] == '\0') return 1; p = aoe_iflist + strspn(aoe_iflist, WHITESPACE); for (; *p; p = q + strspn(q, WHITESPACE)) { q = p + strcspn(p, WHITESPACE); if (q != p) len = q - p; else len = strlen(p); /* last token in aoe_iflist */ if (strlen(ifp->name) == len && !strncmp(ifp->name, p, len)) return 1; if (q == p) break; } return 0; } int set_aoe_iflist(const char __user *user_str, size_t size) { if (size >= IFLISTSZ) return -EINVAL; if (copy_from_user(aoe_iflist, user_str, size)) { printk(KERN_INFO "aoe: copy from user failed\n"); return -EFAULT; } aoe_iflist[size] = 0x00; return 0; } void aoenet_xmit(struct sk_buff_head *queue) { struct sk_buff *skb, *tmp; ulong flags; skb_queue_walk_safe(queue, skb, tmp) { __skb_unlink(skb, queue); spin_lock_irqsave(&txlock, flags); skb_queue_tail(&skbtxq, skb); spin_unlock_irqrestore(&txlock, flags); wake_up(&txwq); } } /* * (1) len doesn't include the header by default. I want this. */ static int aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev) { struct aoe_hdr *h; struct aoe_atahdr *ah; u32 n; int sn; if (dev_net(ifp) != &init_net) goto exit; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) return 0; if (!is_aoe_netif(ifp)) goto exit; skb_push(skb, ETH_HLEN); /* (1) */ sn = sizeof(*h) + sizeof(*ah); if (skb->len >= sn) { sn -= skb_headlen(skb); if (sn > 0 && !__pskb_pull_tail(skb, sn)) goto exit; } h = (struct aoe_hdr *) skb->data; n = get_unaligned_be32(&h->tag); if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31)) goto exit; if (h->verfl & AOEFL_ERR) { n = h->err; if (n > NECODES) n = 0; if (net_ratelimit()) printk(KERN_ERR "%s%d.%d@%s; ecode=%d '%s'\n", "aoe: error packet from ", get_unaligned_be16(&h->major), h->minor, skb->dev->name, h->err, aoe_errlist[n]); goto exit; } switch (h->cmd) { case AOECMD_ATA: /* ata_rsp may keep skb for later processing or give it back */ skb = aoecmd_ata_rsp(skb); break; case AOECMD_CFG: aoecmd_cfg_rsp(skb); break; default: if (h->cmd >= AOECMD_VEND_MIN) break; /* don't complain about vendor commands */ pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd); break; } if (!skb) return 0; exit: dev_kfree_skb(skb); return 0; } static struct packet_type aoe_pt __read_mostly = { .type = __constant_htons(ETH_P_AOE), .func = aoenet_rcv, }; int __init aoenet_init(void) { skb_queue_head_init(&skbtxq); init_waitqueue_head(&txwq); spin_lock_init(&txlock); kts.lock = &txlock; kts.fn = tx; kts.waitq = &txwq; kts.id = 0; snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id); if (aoe_ktstart(&kts)) return -EAGAIN; dev_add_pack(&aoe_pt); return 0; } void aoenet_exit(void) { aoe_ktstop(&kts); skb_queue_purge(&skbtxq); dev_remove_pack(&aoe_pt); }
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 5 5 4 3 2 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 /* * Longest prefix match list implementation * * Copyright (c) 2016,2017 Daniel Mack * Copyright (c) 2016 David Herrmann * * This file is subject to the terms and conditions of version 2 of the GNU * General Public License. See the file COPYING in the main directory of the * Linux distribution for more details. */ #include <linux/bpf.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <net/ipv6.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) struct lpm_trie_node; struct lpm_trie_node { struct rcu_head rcu; struct lpm_trie_node __rcu *child[2]; u32 prefixlen; u32 flags; u8 data[0]; }; struct lpm_trie { struct bpf_map map; struct lpm_trie_node __rcu *root; size_t n_entries; size_t max_prefixlen; size_t data_size; raw_spinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to * match IP addresses to a stored set of ranges. * * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is * interpreted as big endian, so data[0] stores the most significant byte. * * Match ranges are internally stored in instances of struct lpm_trie_node * which each contain their prefix length as well as two pointers that may * lead to more nodes containing more specific matches. Each node also stores * a value that is defined by and returned to userspace via the update_elem * and lookup functions. * * For instance, let's start with a trie that was created with a prefix length * of 32, so it can be used for IPv4 addresses, and one single element that * matches 192.168.0.0/16. The data array would hence contain * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will * stick to IP-address notation for readability though. * * As the trie is empty initially, the new node (1) will be places as root * node, denoted as (R) in the example below. As there are no other node, both * child pointers are %NULL. * * +----------------+ * | (1) (R) | * | 192.168.0.0/16 | * | value: 1 | * | [0] [1] | * +----------------+ * * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already * a node with the same data and a smaller prefix (ie, a less specific one), * node (2) will become a child of (1). In child index depends on the next bit * that is outside of what (1) matches, and that bit is 0, so (2) will be * child[0] of (1): * * +----------------+ * | (1) (R) | * | 192.168.0.0/16 | * | value: 1 | * | [0] [1] | * +----------------+ * | * +----------------+ * | (2) | * | 192.168.0.0/24 | * | value: 2 | * | [0] [1] | * +----------------+ * * The child[1] slot of (1) could be filled with another node which has bit #17 * (the next bit after the ones that (1) matches on) set to 1. For instance, * 192.168.128.0/24: * * +----------------+ * | (1) (R) | * | 192.168.0.0/16 | * | value: 1 | * | [0] [1] | * +----------------+ * | | * +----------------+ +------------------+ * | (2) | | (3) | * | 192.168.0.0/24 | | 192.168.128.0/24 | * | value: 2 | | value: 3 | * | [0] [1] | | [0] [1] | * +----------------+ +------------------+ * * Let's add another node (4) to the game for 192.168.1.0/24. In order to place * it, node (1) is looked at first, and because (4) of the semantics laid out * above (bit #17 is 0), it would normally be attached to (1) as child[0]. * However, that slot is already allocated, so a new node is needed in between. * That node does not have a value attached to it and it will never be * returned to users as result of a lookup. It is only there to differentiate * the traversal further. It will get a prefix as wide as necessary to * distinguish its two children: * * +----------------+ * | (1) (R) | * | 192.168.0.0/16 | * | value: 1 | * | [0] [1] | * +----------------+ * | | * +----------------+ +------------------+ * | (4) (I) | | (3) | * | 192.168.0.0/23 | | 192.168.128.0/24 | * | value: --- | | value: 3 | * | [0] [1] | | [0] [1] | * +----------------+ +------------------+ * | | * +----------------+ +----------------+ * | (2) | | (5) | * | 192.168.0.0/24 | | 192.168.1.0/24 | * | value: 2 | | value: 5 | * | [0] [1] | | [0] [1] | * +----------------+ +----------------+ * * 192.168.1.1/32 would be a child of (5) etc. * * An intermediate node will be turned into a 'real' node on demand. In the * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie. * * A fully populated trie would have a height of 32 nodes, as the trie was * created with a prefix length of 32. * * The lookup starts at the root node. If the current node matches and if there * is a child that can be used to become more specific, the trie is traversed * downwards. The last node in the traversal that is a non-intermediate one is * returned. */ static inline int extract_bit(const u8 *data, size_t index) { return !!(data[index / 8] & (1 << (7 - (index % 8)))); } /** * longest_prefix_match() - determine the longest prefix * @trie: The trie to get internal sizes from * @node: The node to operate on * @key: The key to compare to @node * * Determine the longest prefix of @node that matches the bits in @key. */ static size_t longest_prefix_match(const struct lpm_trie *trie, const struct lpm_trie_node *node, const struct bpf_lpm_trie_key *key) { size_t prefixlen = 0; size_t i; for (i = 0; i < trie->data_size; i++) { size_t b; b = 8 - fls(node->data[i] ^ key->data[i]); prefixlen += b; if (prefixlen >= node->prefixlen || prefixlen >= key->prefixlen) return min(node->prefixlen, key->prefixlen); if (b < 8) break; } return prefixlen; } /* Called from syscall or from eBPF program */ static void *trie_lookup_elem(struct bpf_map *map, void *_key) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *found = NULL; struct bpf_lpm_trie_key *key = _key; /* Start walking the trie from the root node ... */ for (node = rcu_dereference(trie->root); node;) { unsigned int next_bit; size_t matchlen; /* Determine the longest prefix of @node that matches @key. * If it's the maximum possible prefix for this trie, we have * an exact match and can return it directly. */ matchlen = longest_prefix_match(trie, node, key); if (matchlen == trie->max_prefixlen) { found = node; break; } /* If the number of bits that match is smaller than the prefix * length of @node, bail out and return the node we have seen * last in the traversal (ie, the parent). */ if (matchlen < node->prefixlen) break; /* Consider this node as return candidate unless it is an * artificially added intermediate one. */ if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) found = node; /* If the node match is fully satisfied, let's see if we can * become more specific. Determine the next bit in the key and * traverse down. */ next_bit = extract_bit(key->data, node->prefixlen); node = rcu_dereference(node->child[next_bit]); } if (!found) return NULL; return found->data + trie->data_size; } static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, const void *value) { struct lpm_trie_node *node; size_t size = sizeof(struct lpm_trie_node) + trie->data_size; if (value) size += trie->map.value_size; node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN, trie->map.numa_node); if (!node) return NULL; node->flags = 0; if (value) memcpy(node->data + trie->data_size, value, trie->map.value_size); return node; } /* Called from syscall or from eBPF program */ static int trie_update_elem(struct bpf_map *map, void *_key, void *value, u64 flags) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key *key = _key; unsigned long irq_flags; unsigned int next_bit; size_t matchlen = 0; int ret = 0; if (unlikely(flags > BPF_EXIST)) return -EINVAL; if (key->prefixlen > trie->max_prefixlen) return -EINVAL; raw_spin_lock_irqsave(&trie->lock, irq_flags); /* Allocate and fill a new node */ if (trie->n_entries == trie->map.max_entries) { ret = -ENOSPC; goto out; } new_node = lpm_trie_node_alloc(trie, value); if (!new_node) { ret = -ENOMEM; goto out; } trie->n_entries++; new_node->prefixlen = key->prefixlen; RCU_INIT_POINTER(new_node->child[0], NULL); RCU_INIT_POINTER(new_node->child[1], NULL); memcpy(new_node->data, key->data, trie->data_size); /* Now find a slot to attach the new node. To do that, walk the tree * from the root and match as many bits as possible for each node until * we either find an empty slot or a slot that needs to be replaced by * an intermediate node. */ slot = &trie->root; while ((node = rcu_dereference_protected(*slot, lockdep_is_held(&trie->lock)))) { matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || node->prefixlen == key->prefixlen || node->prefixlen == trie->max_prefixlen) break; next_bit = extract_bit(key->data, node->prefixlen); slot = &node->child[next_bit]; } /* If the slot is empty (a free child pointer or an empty root), * simply assign the @new_node to that slot and be done. */ if (!node) { rcu_assign_pointer(*slot, new_node); goto out; } /* If the slot we picked already exists, replace it with @new_node * which already has the correct data array set. */ if (node->prefixlen == matchlen) { new_node->child[0] = node->child[0]; new_node->child[1] = node->child[1]; if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) trie->n_entries--; rcu_assign_pointer(*slot, new_node); kfree_rcu(node, rcu); goto out; } /* If the new node matches the prefix completely, it must be inserted * as an ancestor. Simply insert it between @node and *@slot. */ if (matchlen == key->prefixlen) { next_bit = extract_bit(node->data, matchlen); rcu_assign_pointer(new_node->child[next_bit], node); rcu_assign_pointer(*slot, new_node); goto out; } im_node = lpm_trie_node_alloc(trie, NULL); if (!im_node) { ret = -ENOMEM; goto out; } im_node->prefixlen = matchlen; im_node->flags |= LPM_TREE_NODE_FLAG_IM; memcpy(im_node->data, node->data, trie->data_size); /* Now determine which child to install in which slot */ if (extract_bit(key->data, matchlen)) { rcu_assign_pointer(im_node->child[0], node); rcu_assign_pointer(im_node->child[1], new_node); } else { rcu_assign_pointer(im_node->child[0], new_node); rcu_assign_pointer(im_node->child[1], node); } /* Finally, assign the intermediate node to the determined spot */ rcu_assign_pointer(*slot, im_node); out: if (ret) { if (new_node) trie->n_entries--; kfree(new_node); kfree(im_node); } raw_spin_unlock_irqrestore(&trie->lock, irq_flags); return ret; } static int trie_delete_elem(struct bpf_map *map, void *key) { /* TODO */ return -ENOSYS; } #define LPM_DATA_SIZE_MAX 256 #define LPM_DATA_SIZE_MIN 1 #define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \ sizeof(struct lpm_trie_node)) #define LPM_VAL_SIZE_MIN 1 #define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) #define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; u64 cost = sizeof(*trie), cost_per_node; int ret; if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); /* check sanity of attributes */ if (attr->max_entries == 0 || !(attr->map_flags & BPF_F_NO_PREALLOC) || attr->map_flags & ~LPM_CREATE_FLAG_MASK || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || attr->value_size > LPM_VAL_SIZE_MAX) return ERR_PTR(-EINVAL); trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); if (!trie) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ trie->map.map_type = attr->map_type; trie->map.key_size = attr->key_size; trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; trie->map.map_flags = attr->map_flags; trie->map.numa_node = bpf_map_attr_numa_node(attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; cost_per_node = sizeof(struct lpm_trie_node) + attr->value_size + trie->data_size; cost += (u64) attr->max_entries * cost_per_node; if (cost >= U32_MAX - PAGE_SIZE) { ret = -E2BIG; goto out_err; } trie->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; ret = bpf_map_precharge_memlock(trie->map.pages); if (ret) goto out_err; raw_spin_lock_init(&trie->lock); return &trie->map; out_err: kfree(trie); return ERR_PTR(ret); } static void trie_free(struct bpf_map *map) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct lpm_trie_node __rcu **slot; struct lpm_trie_node *node; /* Wait for outstanding programs to complete * update/lookup/delete/get_next_key and free the trie. */ synchronize_rcu(); /* Always start at the root and walk down to a node that has no * children. Then free that node, nullify its reference in the parent * and start over. */ for (;;) { slot = &trie->root; for (;;) { node = rcu_dereference_protected(*slot, 1); if (!node) goto out; if (rcu_access_pointer(node->child[0])) { slot = &node->child[0]; continue; } if (rcu_access_pointer(node->child[1])) { slot = &node->child[1]; continue; } kfree(node); RCU_INIT_POINTER(*slot, NULL); break; } } out: kfree(trie); } static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) { return -ENOTSUPP; } const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, .map_get_next_key = trie_get_next_key, .map_lookup_elem = trie_lookup_elem, .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, };
7 1 7 7 7 10 6 4 3 9 6 6 1 4 3 3 1 3 1 1 1 3 3 3 3 3 7 7 7 7 7 7 10 11 11 10 3 7 6 6 7 4 11 4 3 2 2 1 1 1 1 2 4 1 2 2 2 4 2 2 4 11 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 /* * linux/fs/sysv/inode.c * * minix/inode.c * Copyright (C) 1991, 1992 Linus Torvalds * * xenix/inode.c * Copyright (C) 1992 Doug Evans * * coh/inode.c * Copyright (C) 1993 Pascal Haible, Bruno Haible * * sysv/inode.c * Copyright (C) 1993 Paul B. Monday * * sysv/inode.c * Copyright (C) 1993 Bruno Haible * Copyright (C) 1997, 1998 Krzysztof G. Baranowski * * This file contains code for read/parsing the superblock. */ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/buffer_head.h> #include "sysv.h" /* * The following functions try to recognize specific filesystems. * * We recognize: * - Xenix FS by its magic number. * - SystemV FS by its magic number. * - Coherent FS by its funny fname/fpack field. * - SCO AFS by s_nfree == 0xffff * - V7 FS has no distinguishing features. * * We discriminate among SystemV4 and SystemV2 FS by the assumption that * the time stamp is not < 01-01-1980. */ enum { JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60 }; static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links) { struct buffer_head *bh1 = sbi->s_bh1; struct buffer_head *bh2 = sbi->s_bh2; struct xenix_super_block * sbd1; struct xenix_super_block * sbd2; if (bh1 != bh2) sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data; else { /* block size = 512, so bh1 != bh2 */ sbd1 = (struct xenix_super_block *) bh1->b_data; sbd2 = (struct xenix_super_block *) (bh2->b_data - 512); } *max_links = XENIX_LINK_MAX; sbi->s_fic_size = XENIX_NICINOD; sbi->s_flc_size = XENIX_NICFREE; sbi->s_sbd1 = (char *)sbd1; sbi->s_sbd2 = (char *)sbd2; sbi->s_sb_fic_count = &sbd1->s_ninode; sbi->s_sb_fic_inodes = &sbd1->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd2->s_tinode; sbi->s_bcache_count = &sbd1->s_nfree; sbi->s_bcache = &sbd1->s_free[0]; sbi->s_free_blocks = &sbd2->s_tfree; sbi->s_sb_time = &sbd2->s_time; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize); } static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links) { struct sysv4_super_block * sbd; struct buffer_head *bh1 = sbi->s_bh1; struct buffer_head *bh2 = sbi->s_bh2; if (bh1 == bh2) sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2); else sbd = (struct sysv4_super_block *) bh2->b_data; *max_links = SYSV_LINK_MAX; sbi->s_fic_size = SYSV_NICINOD; sbi->s_flc_size = SYSV_NICFREE; sbi->s_sbd1 = (char *)sbd; sbi->s_sbd2 = (char *)sbd; sbi->s_sb_fic_count = &sbd->s_ninode; sbi->s_sb_fic_inodes = &sbd->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd->s_tinode; sbi->s_bcache_count = &sbd->s_nfree; sbi->s_bcache = &sbd->s_free[0]; sbi->s_free_blocks = &sbd->s_tfree; sbi->s_sb_time = &sbd->s_time; sbi->s_sb_state = &sbd->s_state; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); } static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links) { struct sysv2_super_block *sbd; struct buffer_head *bh1 = sbi->s_bh1; struct buffer_head *bh2 = sbi->s_bh2; if (bh1 == bh2) sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2); else sbd = (struct sysv2_super_block *) bh2->b_data; *max_links = SYSV_LINK_MAX; sbi->s_fic_size = SYSV_NICINOD; sbi->s_flc_size = SYSV_NICFREE; sbi->s_sbd1 = (char *)sbd; sbi->s_sbd2 = (char *)sbd; sbi->s_sb_fic_count = &sbd->s_ninode; sbi->s_sb_fic_inodes = &sbd->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd->s_tinode; sbi->s_bcache_count = &sbd->s_nfree; sbi->s_bcache = &sbd->s_free[0]; sbi->s_free_blocks = &sbd->s_tfree; sbi->s_sb_time = &sbd->s_time; sbi->s_sb_state = &sbd->s_state; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); } static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links) { struct coh_super_block * sbd; struct buffer_head *bh1 = sbi->s_bh1; sbd = (struct coh_super_block *) bh1->b_data; *max_links = COH_LINK_MAX; sbi->s_fic_size = COH_NICINOD; sbi->s_flc_size = COH_NICFREE; sbi->s_sbd1 = (char *)sbd; sbi->s_sbd2 = (char *)sbd; sbi->s_sb_fic_count = &sbd->s_ninode; sbi->s_sb_fic_inodes = &sbd->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd->s_tinode; sbi->s_bcache_count = &sbd->s_nfree; sbi->s_bcache = &sbd->s_free[0]; sbi->s_free_blocks = &sbd->s_tfree; sbi->s_sb_time = &sbd->s_time; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); } static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links) { struct buffer_head *bh2 = sbi->s_bh2; struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data; *max_links = V7_LINK_MAX; sbi->s_fic_size = V7_NICINOD; sbi->s_flc_size = V7_NICFREE; sbi->s_sbd1 = (char *)sbd; sbi->s_sbd2 = (char *)sbd; sbi->s_sb_fic_count = &sbd->s_ninode; sbi->s_sb_fic_inodes = &sbd->s_inode[0]; sbi->s_sb_total_free_inodes = &sbd->s_tinode; sbi->s_bcache_count = &sbd->s_nfree; sbi->s_bcache = &sbd->s_free[0]; sbi->s_free_blocks = &sbd->s_tfree; sbi->s_sb_time = &sbd->s_time; sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); } static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh) { struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data; if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544)) sbi->s_bytesex = BYTESEX_LE; else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544)) sbi->s_bytesex = BYTESEX_BE; else return 0; switch (fs32_to_cpu(sbi, sbd->s_type)) { case 1: sbi->s_type = FSTYPE_XENIX; return 1; case 2: sbi->s_type = FSTYPE_XENIX; return 2; default: return 0; } } static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh) { struct super_block *sb = sbi->s_sb; /* All relevant fields are at the same offsets in R2 and R4 */ struct sysv4_super_block * sbd; u32 type; sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2); if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20)) sbi->s_bytesex = BYTESEX_LE; else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20)) sbi->s_bytesex = BYTESEX_BE; else return 0; type = fs32_to_cpu(sbi, sbd->s_type); if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) { sbi->s_type = FSTYPE_AFS; sbi->s_forced_ro = 1; if (!sb_rdonly(sb)) { printk("SysV FS: SCO EAFS on %s detected, " "forcing read-only mode.\n", sb->s_id); } return type; } if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) { /* this is likely to happen on SystemV2 FS */ if (type > 3 || type < 1) return 0; sbi->s_type = FSTYPE_SYSV2; return type; } if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10)) return 0; /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10, 0x20 or 0x30 indicates that symbolic links and the 14-character filename limit is gone. Due to lack of information about this feature read-only mode seems to be a reasonable approach... -KGB */ if (type >= 0x10) { printk("SysV FS: can't handle long file names on %s, " "forcing read-only mode.\n", sb->s_id); sbi->s_forced_ro = 1; } sbi->s_type = FSTYPE_SYSV4; return type >= 0x10 ? type >> 4 : type; } static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh) { struct coh_super_block * sbd; sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2); if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6)) || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6))) return 0; sbi->s_bytesex = BYTESEX_PDP; sbi->s_type = FSTYPE_COH; return 1; } static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh) { int size = detect_sysv(sbi, bh); return size>2 ? 0 : size; } static struct { int block; int (*test)(struct sysv_sb_info *, struct buffer_head *); } flavours[] = { {1, detect_xenix}, {0, detect_sysv}, {0, detect_coherent}, {9, detect_sysv_odd}, {15,detect_sysv_odd}, {18,detect_sysv}, }; static char *flavour_names[] = { [FSTYPE_XENIX] = "Xenix", [FSTYPE_SYSV4] = "SystemV", [FSTYPE_SYSV2] = "SystemV Release 2", [FSTYPE_COH] = "Coherent", [FSTYPE_V7] = "V7", [FSTYPE_AFS] = "AFS", }; static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = { [FSTYPE_XENIX] = detected_xenix, [FSTYPE_SYSV4] = detected_sysv4, [FSTYPE_SYSV2] = detected_sysv2, [FSTYPE_COH] = detected_coherent, [FSTYPE_V7] = detected_v7, [FSTYPE_AFS] = detected_sysv4, }; static int complete_read_super(struct super_block *sb, int silent, int size) { struct sysv_sb_info *sbi = SYSV_SB(sb); struct inode *root_inode; char *found = flavour_names[sbi->s_type]; u_char n_bits = size+8; int bsize = 1 << n_bits; int bsize_4 = bsize >> 2; sbi->s_firstinodezone = 2; flavour_setup[sbi->s_type](sbi, &sb->s_max_links); sbi->s_truncate = 1; sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; sbi->s_inodes_per_block = bsize >> 6; sbi->s_inodes_per_block_1 = (bsize >> 6)-1; sbi->s_inodes_per_block_bits = n_bits-6; sbi->s_ind_per_block = bsize_4; sbi->s_ind_per_block_2 = bsize_4*bsize_4; sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4)); sbi->s_ind_per_block_bits = n_bits-2; sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone) << sbi->s_inodes_per_block_bits; if (!silent) printk("VFS: Found a %s FS (block size = %ld) on device %s\n", found, sb->s_blocksize, sb->s_id); sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; /* set up enough so that it can read an inode */ sb->s_op = &sysv_sops; if (sbi->s_forced_ro) sb->s_flags |= MS_RDONLY; if (sbi->s_truncate) sb->s_d_op = &sysv_dentry_operations; root_inode = sysv_iget(sb, SYSV_ROOT_INO); if (IS_ERR(root_inode)) { printk("SysV FS: get root inode failed\n"); return 0; } sb->s_root = d_make_root(root_inode); if (!sb->s_root) { printk("SysV FS: get root dentry failed\n"); return 0; } return 1; } static int sysv_fill_super(struct super_block *sb, void *data, int silent) { struct buffer_head *bh1, *bh = NULL; struct sysv_sb_info *sbi; unsigned long blocknr; int size = 0, i; BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block)); BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block)); BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block)); BUILD_BUG_ON(500 != sizeof (struct coh_super_block)); BUILD_BUG_ON(64 != sizeof (struct sysv_inode)); sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sbi->s_sb = sb; sbi->s_block_base = 0; mutex_init(&sbi->s_lock); sb->s_fs_info = sbi; sb_set_blocksize(sb, BLOCK_SIZE); for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) { brelse(bh); bh = sb_bread(sb, flavours[i].block); if (!bh) continue; size = flavours[i].test(SYSV_SB(sb), bh); } if (!size) goto Eunknown; switch (size) { case 1: blocknr = bh->b_blocknr << 1; brelse(bh); sb_set_blocksize(sb, 512); bh1 = sb_bread(sb, blocknr); bh = sb_bread(sb, blocknr + 1); break; case 2: bh1 = bh; break; case 3: blocknr = bh->b_blocknr >> 1; brelse(bh); sb_set_blocksize(sb, 2048); bh1 = bh = sb_bread(sb, blocknr); break; default: goto Ebadsize; } if (bh && bh1) { sbi->s_bh1 = bh1; sbi->s_bh2 = bh; if (complete_read_super(sb, silent, size)) return 0; } brelse(bh1); brelse(bh); sb_set_blocksize(sb, BLOCK_SIZE); printk("oldfs: cannot read superblock\n"); failed: kfree(sbi); return -EINVAL; Eunknown: brelse(bh); if (!silent) printk("VFS: unable to find oldfs superblock on device %s\n", sb->s_id); goto failed; Ebadsize: brelse(bh); if (!silent) printk("VFS: oldfs: unsupported block size (%dKb)\n", 1<<(size-2)); goto failed; } static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh) { struct v7_super_block *v7sb; struct sysv_inode *v7i; struct buffer_head *bh2; struct sysv_sb_info *sbi; sbi = sb->s_fs_info; /* plausibility check on superblock */ v7sb = (struct v7_super_block *) bh->b_data; if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE) return 0; /* plausibility check on root inode: it is a directory, with a nonzero size that is a multiple of 16 */ bh2 = sb_bread(sb, 2); if (bh2 == NULL) return 0; v7i = (struct sysv_inode *)(bh2->b_data + 64); if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR || (fs32_to_cpu(sbi, v7i->i_size) == 0) || (fs32_to_cpu(sbi, v7i->i_size) & 017) || (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES * sizeof(struct sysv_dir_entry))) { brelse(bh2); return 0; } brelse(bh2); return 1; } static int v7_fill_super(struct super_block *sb, void *data, int silent) { struct sysv_sb_info *sbi; struct buffer_head *bh; if (440 != sizeof (struct v7_super_block)) panic("V7 FS: bad super-block size"); if (64 != sizeof (struct sysv_inode)) panic("sysv fs: bad i-node size"); sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; sbi->s_sb = sb; sbi->s_block_base = 0; sbi->s_type = FSTYPE_V7; mutex_init(&sbi->s_lock); sb->s_fs_info = sbi; sb_set_blocksize(sb, 512); if ((bh = sb_bread(sb, 1)) == NULL) { if (!silent) printk("VFS: unable to read V7 FS superblock on " "device %s.\n", sb->s_id); goto failed; } /* Try PDP-11 UNIX */ sbi->s_bytesex = BYTESEX_PDP; if (v7_sanity_check(sb, bh)) goto detected; /* Try PC/IX, v7/x86 */ sbi->s_bytesex = BYTESEX_LE; if (v7_sanity_check(sb, bh)) goto detected; goto failed; detected: sbi->s_bh1 = bh; sbi->s_bh2 = bh; if (complete_read_super(sb, silent, 1)) return 0; failed: printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n", sb->s_id); brelse(bh); kfree(sbi); return -EINVAL; } /* Every kernel module contains stuff like this. */ static struct dentry *sysv_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); } static struct dentry *v7_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); } static struct file_system_type sysv_fs_type = { .owner = THIS_MODULE, .name = "sysv", .mount = sysv_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("sysv"); static struct file_system_type v7_fs_type = { .owner = THIS_MODULE, .name = "v7", .mount = v7_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("v7"); MODULE_ALIAS("v7"); static int __init init_sysv_fs(void) { int error; error = sysv_init_icache(); if (error) goto out; error = register_filesystem(&sysv_fs_type); if (error) goto destroy_icache; error = register_filesystem(&v7_fs_type); if (error) goto unregister; return 0; unregister: unregister_filesystem(&sysv_fs_type); destroy_icache: sysv_destroy_icache(); out: return error; } static void __exit exit_sysv_fs(void) { unregister_filesystem(&sysv_fs_type); unregister_filesystem(&v7_fs_type); sysv_destroy_icache(); } module_init(init_sysv_fs) module_exit(exit_sysv_fs) MODULE_LICENSE("GPL");
43 65 65 65 13 64 65 7 107 102 107 68 4 2 65 18 3 65 12 46 14 16 16 41 60 11 11 107 60 14 60 82 82 82 1 12 12 9 12 82 82 82 82 81 82 82 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 /* * Performance events ring-buffer code: * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING */ #include <linux/perf_event.h> #include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/circ_buf.h> #include <linux/poll.h> #include <linux/nospec.h> #include "internal.h" static void perf_output_wakeup(struct perf_output_handle *handle) { atomic_set(&handle->rb->poll, POLLIN); handle->event->pending_wakeup = 1; irq_work_queue(&handle->event->pending); } /* * We need to ensure a later event_id doesn't publish a head when a former * event isn't done writing. However since we need to deal with NMIs we * cannot fully serialize things. * * We only publish the head (and generate a wakeup) when the outer-most * event completes. */ static void perf_output_get_handle(struct perf_output_handle *handle) { struct ring_buffer *rb = handle->rb; preempt_disable(); local_inc(&rb->nest); handle->wakeup = local_read(&rb->wakeup); } static void perf_output_put_handle(struct perf_output_handle *handle) { struct ring_buffer *rb = handle->rb; unsigned long head; again: /* * In order to avoid publishing a head value that goes backwards, * we must ensure the load of @rb->head happens after we've * incremented @rb->nest. * * Otherwise we can observe a @rb->head value before one published * by an IRQ/NMI happening between the load and the increment. */ barrier(); head = local_read(&rb->head); /* * IRQ/NMI can happen here and advance @rb->head, causing our * load above to be stale. */ /* * If this isn't the outermost nesting, we don't have to update * @rb->user_page->data_head. */ if (local_read(&rb->nest) > 1) { local_dec(&rb->nest); goto out; } /* * Since the mmap() consumer (userspace) can run on a different CPU: * * kernel user * * if (LOAD ->data_tail) { LOAD ->data_head * (A) smp_rmb() (C) * STORE $data LOAD $data * smp_wmb() (B) smp_mb() (D) * STORE ->data_head STORE ->data_tail * } * * Where A pairs with D, and B pairs with C. * * In our case (A) is a control dependency that separates the load of * the ->data_tail and the stores of $data. In case ->data_tail * indicates there is no room in the buffer to store $data we do not. * * D needs to be a full barrier since it separates the data READ * from the tail WRITE. * * For B a WMB is sufficient since it separates two WRITEs, and for C * an RMB is sufficient since it separates two READs. * * See perf_output_begin(). */ smp_wmb(); /* B, matches C */ WRITE_ONCE(rb->user_page->data_head, head); /* * We must publish the head before decrementing the nest count, * otherwise an IRQ/NMI can publish a more recent head value and our * write will (temporarily) publish a stale value. */ barrier(); local_set(&rb->nest, 0); /* * Ensure we decrement @rb->nest before we validate the @rb->head. * Otherwise we cannot be sure we caught the 'last' nested update. */ barrier(); if (unlikely(head != local_read(&rb->head))) { local_inc(&rb->nest); goto again; } if (handle->wakeup != local_read(&rb->wakeup)) perf_output_wakeup(handle); out: preempt_enable(); } static bool __always_inline ring_buffer_has_space(unsigned long head, unsigned long tail, unsigned long data_size, unsigned int size, bool backward) { if (!backward) return CIRC_SPACE(head, tail, data_size) >= size; else return CIRC_SPACE(tail, head, data_size) >= size; } static int __always_inline __perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, bool backward) { struct ring_buffer *rb; unsigned long tail, offset, head; int have_lost, page_shift; struct { struct perf_event_header header; u64 id; u64 lost; } lost_event; rcu_read_lock(); /* * For inherited events we send all the output towards the parent. */ if (event->parent) event = event->parent; rb = rcu_dereference(event->rb); if (unlikely(!rb)) goto out; if (unlikely(rb->paused)) { if (rb->nr_pages) local_inc(&rb->lost); goto out; } handle->rb = rb; handle->event = event; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { size += sizeof(lost_event); if (event->attr.sample_id_all) size += event->id_header_size; } perf_output_get_handle(handle); do { tail = READ_ONCE(rb->user_page->data_tail); offset = head = local_read(&rb->head); if (!rb->overwrite) { if (unlikely(!ring_buffer_has_space(head, tail, perf_data_size(rb), size, backward))) goto fail; } /* * The above forms a control dependency barrier separating the * @tail load above from the data stores below. Since the @tail * load is required to compute the branch to fail below. * * A, matches D; the full memory barrier userspace SHOULD issue * after reading the data and before storing the new tail * position. * * See perf_output_put_handle(). */ if (!backward) head += size; else head -= size; } while (local_cmpxchg(&rb->head, offset, head) != offset); if (backward) { offset = head; head = (u64)(-head); } /* * We rely on the implied barrier() by local_cmpxchg() to ensure * none of the data stores below can be lifted up by the compiler. */ if (unlikely(head - local_read(&rb->wakeup) > rb->watermark)) local_add(rb->watermark, &rb->wakeup); page_shift = PAGE_SHIFT + page_order(rb); handle->page = (offset >> page_shift) & (rb->nr_pages - 1); offset &= (1UL << page_shift) - 1; handle->addr = rb->data_pages[handle->page] + offset; handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { struct perf_sample_data sample_data; lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); perf_event_header__init_id(&lost_event.header, &sample_data, event); perf_output_put(handle, lost_event); perf_event__output_id_sample(event, handle, &sample_data); } return 0; fail: local_inc(&rb->lost); perf_output_put_handle(handle); out: rcu_read_unlock(); return -ENOSPC; } int perf_output_begin_forward(struct perf_output_handle *handle, struct perf_event *event, unsigned int size) { return __perf_output_begin(handle, event, size, false); } int perf_output_begin_backward(struct perf_output_handle *handle, struct perf_event *event, unsigned int size) { return __perf_output_begin(handle, event, size, true); } int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size) { return __perf_output_begin(handle, event, size, unlikely(is_write_backward(event))); } unsigned int perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { return __output_copy(handle, buf, len); } unsigned int perf_output_skip(struct perf_output_handle *handle, unsigned int len) { return __output_skip(handle, NULL, len); } void perf_output_end(struct perf_output_handle *handle) { perf_output_put_handle(handle); rcu_read_unlock(); } static void ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) { long max_size = perf_data_size(rb); if (watermark) rb->watermark = min(max_size, watermark); if (!rb->watermark) rb->watermark = max_size / 2; if (flags & RING_BUFFER_WRITABLE) rb->overwrite = 0; else rb->overwrite = 1; atomic_set(&rb->refcount, 1); INIT_LIST_HEAD(&rb->event_list); spin_lock_init(&rb->event_lock); /* * perf_output_begin() only checks rb->paused, therefore * rb->paused must be true if we have no pages for output. */ if (!rb->nr_pages) rb->paused = 1; } void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) { /* * OVERWRITE is determined by perf_aux_output_end() and can't * be passed in directly. */ if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) return; handle->aux_flags |= flags; } EXPORT_SYMBOL_GPL(perf_aux_output_flag); /* * This is called before hardware starts writing to the AUX area to * obtain an output handle and make sure there's room in the buffer. * When the capture completes, call perf_aux_output_end() to commit * the recorded data to the buffer. * * The ordering is similar to that of perf_output_{begin,end}, with * the exception of (B), which should be taken care of by the pmu * driver, since ordering rules will differ depending on hardware. * * Call this from pmu::start(); see the comment in perf_aux_output_end() * about its use in pmu callbacks. Both can also be called from the PMI * handler if needed. */ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { struct perf_event *output_event = event; unsigned long aux_head, aux_tail; struct ring_buffer *rb; if (output_event->parent) output_event = output_event->parent; /* * Since this will typically be open across pmu::add/pmu::del, we * grab ring_buffer's refcount instead of holding rcu read lock * to make sure it doesn't disappear under us. */ rb = ring_buffer_get(output_event); if (!rb) return NULL; if (!rb_has_aux(rb)) goto err; /* * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(), * about to get freed, so we leave immediately. * * Checking rb::aux_mmap_count and rb::refcount has to be done in * the same order, see perf_mmap_close. Otherwise we end up freeing * aux pages in this path, which is a bug, because in_atomic(). */ if (!atomic_read(&rb->aux_mmap_count)) goto err; if (!atomic_inc_not_zero(&rb->aux_refcount)) goto err; /* * Nesting is not supported for AUX area, make sure nested * writers are caught early */ if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) goto err_put; aux_head = rb->aux_head; handle->rb = rb; handle->event = event; handle->head = aux_head; handle->size = 0; handle->aux_flags = 0; /* * In overwrite mode, AUX data stores do not depend on aux_tail, * therefore (A) control dependency barrier does not exist. The * (B) <-> (C) ordering is still observed by the pmu driver. */ if (!rb->aux_overwrite) { aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); /* * handle->size computation depends on aux_tail load; this forms a * control dependency barrier separating aux_tail load from aux data * store that will be enabled on successful return */ if (!handle->size) { /* A, matches D */ event->pending_disable = 1; perf_output_wakeup(handle); local_set(&rb->aux_nest, 0); goto err_put; } } return handle->rb->aux_priv; err_put: /* can't be last */ rb_free_aux(rb); err: ring_buffer_put(rb); handle->event = NULL; return NULL; } static bool __always_inline rb_need_aux_wakeup(struct ring_buffer *rb) { if (rb->aux_overwrite) return false; if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) { rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark); return true; } return false; } /* * Commit the data written by hardware into the ring buffer by adjusting * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the * pmu driver's responsibility to observe ordering rules of the hardware, * so that all the data is externally visible before this is called. * * Note: this has to be called from pmu::stop() callback, as the assumption * of the AUX buffer management code is that after pmu::stop(), the AUX * transaction must be stopped and therefore drop the AUX reference count. */ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); struct ring_buffer *rb = handle->rb; unsigned long aux_head; /* in overwrite mode, driver provides aux_head via handle */ if (rb->aux_overwrite) { handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; rb->aux_head = aux_head; } else { handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; aux_head = rb->aux_head; rb->aux_head += size; } if (size || handle->aux_flags) { /* * Only send RECORD_AUX if we have something useful to communicate */ perf_event_aux_event(handle->event, aux_head, size, handle->aux_flags); } WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; if (wakeup) { if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) handle->event->pending_disable = 1; perf_output_wakeup(handle); } handle->event = NULL; local_set(&rb->aux_nest, 0); /* can't be last */ rb_free_aux(rb); ring_buffer_put(rb); } /* * Skip over a given number of bytes in the AUX buffer, due to, for example, * hardware's alignment constraints. */ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { struct ring_buffer *rb = handle->rb; if (size > handle->size) return -ENOSPC; rb->aux_head += size; WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark; } handle->head = rb->aux_head; handle->size -= size; return 0; } void *perf_get_aux(struct perf_output_handle *handle) { /* this is only valid between perf_aux_output_begin and *_end */ if (!handle->event) return NULL; return handle->rb->aux_priv; } #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) { struct page *page; if (order > MAX_ORDER) order = MAX_ORDER; do { page = alloc_pages_node(node, PERF_AUX_GFP, order); } while (!page && order--); if (page && order) { /* * Communicate the allocation size to the driver: * if we managed to secure a high-order allocation, * set its first page's private to this order; * !PagePrivate(page) means it's just a normal page. */ split_page(page, order); SetPagePrivate(page); set_page_private(page, order); } return page; } static void rb_free_aux_page(struct ring_buffer *rb, int idx) { struct page *page = virt_to_page(rb->aux_pages[idx]); ClearPagePrivate(page); page->mapping = NULL; __free_page(page); } static void __rb_free_aux(struct ring_buffer *rb) { int pg; /* * Should never happen, the last reference should be dropped from * perf_mmap_close() path, which first stops aux transactions (which * in turn are the atomic holders of aux_refcount) and then does the * last rb_free_aux(). */ WARN_ON_ONCE(in_atomic()); if (rb->aux_priv) { rb->free_aux(rb->aux_priv); rb->free_aux = NULL; rb->aux_priv = NULL; } if (rb->aux_nr_pages) { for (pg = 0; pg < rb->aux_nr_pages; pg++) rb_free_aux_page(rb, pg); kfree(rb->aux_pages); rb->aux_nr_pages = 0; } } int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); int ret = -ENOMEM, max_order = 0; if (!has_aux(event)) return -EOPNOTSUPP; if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { /* * We need to start with the max_order that fits in nr_pages, * not the other way around, hence ilog2() and not get_order. */ max_order = ilog2(nr_pages); /* * PMU requests more than one contiguous chunks of memory * for SW double buffering */ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && !overwrite) { if (!max_order) return -EINVAL; max_order--; } } rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); if (!rb->aux_pages) return -ENOMEM; rb->free_aux = event->pmu->free_aux; for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { struct page *page; int last, order; order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); page = rb_alloc_aux_page(node, order); if (!page) goto out; for (last = rb->aux_nr_pages + (1 << page_private(page)); last > rb->aux_nr_pages; rb->aux_nr_pages++) rb->aux_pages[rb->aux_nr_pages] = page_address(page++); } /* * In overwrite mode, PMUs that don't support SG may not handle more * than one contiguous allocation, since they rely on PMI to do double * buffering. In this case, the entire buffer has to be one contiguous * chunk. */ if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) && overwrite) { struct page *page = virt_to_page(rb->aux_pages[0]); if (page_private(page) != max_order) goto out; } rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, overwrite); if (!rb->aux_priv) goto out; ret = 0; /* * aux_pages (and pmu driver's private data, aux_priv) will be * referenced in both producer's and consumer's contexts, thus * we keep a refcount here to make sure either of the two can * reference them safely. */ atomic_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; rb->aux_watermark = watermark; if (!rb->aux_watermark && !rb->aux_overwrite) rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); out: if (!ret) rb->aux_pgoff = pgoff; else __rb_free_aux(rb); return ret; } void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) __rb_free_aux(rb); } #ifndef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with regular GFP_KERNEL-0 pages. */ static struct page * __perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; if (pgoff == 0) return virt_to_page(rb->user_page); return virt_to_page(rb->data_pages[pgoff - 1]); } static void *perf_mmap_alloc_page(int cpu) { struct page *page; int node; node = (cpu == -1) ? cpu : cpu_to_node(cpu); page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) return NULL; return page_address(page); } struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { struct ring_buffer *rb; unsigned long size; int i; size = sizeof(struct ring_buffer); size += nr_pages * sizeof(void *); if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) goto fail; rb = kzalloc(size, GFP_KERNEL); if (!rb) goto fail; rb->user_page = perf_mmap_alloc_page(cpu); if (!rb->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { rb->data_pages[i] = perf_mmap_alloc_page(cpu); if (!rb->data_pages[i]) goto fail_data_pages; } rb->nr_pages = nr_pages; ring_buffer_init(rb, watermark, flags); return rb; fail_data_pages: for (i--; i >= 0; i--) free_page((unsigned long)rb->data_pages[i]); free_page((unsigned long)rb->user_page); fail_user_page: kfree(rb); fail: return NULL; } static void perf_mmap_free_page(unsigned long addr) { struct page *page = virt_to_page((void *)addr); page->mapping = NULL; __free_page(page); } void rb_free(struct ring_buffer *rb) { int i; perf_mmap_free_page((unsigned long)rb->user_page); for (i = 0; i < rb->nr_pages; i++) perf_mmap_free_page((unsigned long)rb->data_pages[i]); kfree(rb); } #else static int data_page_nr(struct ring_buffer *rb) { return rb->nr_pages << page_order(rb); } static struct page * __perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. */ if (pgoff > data_page_nr(rb)) return NULL; return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE); } static void perf_mmap_unmark_page(void *addr) { struct page *page = vmalloc_to_page(addr); page->mapping = NULL; } static void rb_free_work(struct work_struct *work) { struct ring_buffer *rb; void *base; int i, nr; rb = container_of(work, struct ring_buffer, work); nr = data_page_nr(rb); base = rb->user_page; /* The '<=' counts in the user page. */ for (i = 0; i <= nr; i++) perf_mmap_unmark_page(base + (i * PAGE_SIZE)); vfree(base); kfree(rb); } void rb_free(struct ring_buffer *rb) { schedule_work(&rb->work); } struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) { struct ring_buffer *rb; unsigned long size; void *all_buf; size = sizeof(struct ring_buffer); size += sizeof(void *); rb = kzalloc(size, GFP_KERNEL); if (!rb) goto fail; INIT_WORK(&rb->work, rb_free_work); all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); if (!all_buf) goto fail_all_buf; rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; if (nr_pages) { rb->nr_pages = 1; rb->page_order = ilog2(nr_pages); } ring_buffer_init(rb, watermark, flags); return rb; fail_all_buf: kfree(rb); fail: return NULL; } #endif struct page * perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { if (rb->aux_nr_pages) { /* above AUX space */ if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) return NULL; /* AUX space */ if (pgoff >= rb->aux_pgoff) { int aux_pgoff = array_index_nospec(pgoff - rb->aux_pgoff, rb->aux_nr_pages); return virt_to_page(rb->aux_pages[aux_pgoff]); } } return __perf_mmap_to_page(rb, pgoff); }
1 1 1 3 1 1 1 1 1 1 1 3 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 /** * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University * Copyright (C) 2004-2007 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Michael C. Thompson <mcthomps@us.ibm.com> * Tyler Hicks <tyhicks@ou.edu> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation; either version 2 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. */ #include <linux/dcache.h> #include <linux/file.h> #include <linux/module.h> #include <linux/namei.h> #include <linux/skbuff.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/key.h> #include <linux/parser.h> #include <linux/fs_stack.h> #include <linux/slab.h> #include <linux/magic.h> #include "ecryptfs_kernel.h" /** * Module parameter that defines the ecryptfs_verbosity level. */ int ecryptfs_verbosity = 0; module_param(ecryptfs_verbosity, int, 0); MODULE_PARM_DESC(ecryptfs_verbosity, "Initial verbosity level (0 or 1; defaults to " "0, which is Quiet)"); /** * Module parameter that defines the number of message buffer elements */ unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; module_param(ecryptfs_message_buf_len, uint, 0); MODULE_PARM_DESC(ecryptfs_message_buf_len, "Number of message buffer elements"); /** * Module parameter that defines the maximum guaranteed amount of time to wait * for a response from ecryptfsd. The actual sleep time will be, more than * likely, a small amount greater than this specified value, but only less if * the message successfully arrives. */ signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; module_param(ecryptfs_message_wait_timeout, long, 0); MODULE_PARM_DESC(ecryptfs_message_wait_timeout, "Maximum number of seconds that an operation will " "sleep while waiting for a message response from " "userspace"); /** * Module parameter that is an estimate of the maximum number of users * that will be concurrently using eCryptfs. Set this to the right * value to balance performance and memory use. */ unsigned int ecryptfs_number_of_users = ECRYPTFS_DEFAULT_NUM_USERS; module_param(ecryptfs_number_of_users, uint, 0); MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " "concurrent users of eCryptfs"); void __ecryptfs_printk(const char *fmt, ...) { va_list args; va_start(args, fmt); if (fmt[1] == '7') { /* KERN_DEBUG */ if (ecryptfs_verbosity >= 1) vprintk(fmt, args); } else vprintk(fmt, args); va_end(args); } /** * ecryptfs_init_lower_file * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with * the lower dentry and the lower mount set * * eCryptfs only ever keeps a single open file for every lower * inode. All I/O operations to the lower inode occur through that * file. When the first eCryptfs dentry that interposes with the first * lower dentry for that inode is created, this function creates the * lower file struct and associates it with the eCryptfs * inode. When all eCryptfs files associated with the inode are released, the * file is closed. * * The lower file will be opened with read/write permissions, if * possible. Otherwise, it is opened read-only. * * This function does nothing if a lower file is already * associated with the eCryptfs inode. * * Returns zero on success; non-zero otherwise */ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); struct path *path = ecryptfs_dentry_to_lower_path(dentry); int rc; rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, cred); if (rc) { printk(KERN_ERR "Error opening lower file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " "rc = [%d]\n", path->dentry, path->mnt, rc); (*lower_file) = NULL; } return rc; } int ecryptfs_get_lower_file(struct dentry *dentry, struct inode *inode) { struct ecryptfs_inode_info *inode_info; int count, rc = 0; inode_info = ecryptfs_inode_to_private(inode); mutex_lock(&inode_info->lower_file_mutex); count = atomic_inc_return(&inode_info->lower_file_count); if (WARN_ON_ONCE(count < 1)) rc = -EINVAL; else if (count == 1) { rc = ecryptfs_init_lower_file(dentry, &inode_info->lower_file); if (rc) atomic_set(&inode_info->lower_file_count, 0); } mutex_unlock(&inode_info->lower_file_mutex); return rc; } void ecryptfs_put_lower_file(struct inode *inode) { struct ecryptfs_inode_info *inode_info; inode_info = ecryptfs_inode_to_private(inode); if (atomic_dec_and_mutex_lock(&inode_info->lower_file_count, &inode_info->lower_file_mutex)) { filemap_write_and_wait(inode->i_mapping); fput(inode_info->lower_file); inode_info->lower_file = NULL; mutex_unlock(&inode_info->lower_file_mutex); } } enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, ecryptfs_opt_unlink_sigs, ecryptfs_opt_mount_auth_tok_only, ecryptfs_opt_check_dev_ruid, ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, {ecryptfs_opt_cipher, "cipher=%s"}, {ecryptfs_opt_ecryptfs_cipher, "ecryptfs_cipher=%s"}, {ecryptfs_opt_ecryptfs_key_bytes, "ecryptfs_key_bytes=%u"}, {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_unlink_sigs, "ecryptfs_unlink_sigs"}, {ecryptfs_opt_mount_auth_tok_only, "ecryptfs_mount_auth_tok_only"}, {ecryptfs_opt_check_dev_ruid, "ecryptfs_check_dev_ruid"}, {ecryptfs_opt_err, NULL} }; static int ecryptfs_init_global_auth_toks( struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { struct ecryptfs_global_auth_tok *global_auth_tok; struct ecryptfs_auth_tok *auth_tok; int rc = 0; list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { rc = ecryptfs_keyring_auth_tok_for_sig( &global_auth_tok->global_auth_tok_key, &auth_tok, global_auth_tok->sig); if (rc) { printk(KERN_ERR "Could not find valid key in user " "session keyring for sig specified in mount " "option: [%s]\n", global_auth_tok->sig); global_auth_tok->flags |= ECRYPTFS_AUTH_TOK_INVALID; goto out; } else { global_auth_tok->flags &= ~ECRYPTFS_AUTH_TOK_INVALID; up_write(&(global_auth_tok->global_auth_tok_key)->sem); } } out: return rc; } static void ecryptfs_init_mount_crypt_stat( struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { memset((void *)mount_crypt_stat, 0, sizeof(struct ecryptfs_mount_crypt_stat)); INIT_LIST_HEAD(&mount_crypt_stat->global_auth_tok_list); mutex_init(&mount_crypt_stat->global_auth_tok_list_mutex); mount_crypt_stat->flags |= ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED; } /** * ecryptfs_parse_options * @sb: The ecryptfs super block * @options: The options passed to the kernel * @check_ruid: set to 1 if device uid should be checked against the ruid * * Parse mount options: * debug=N - ecryptfs_verbosity level for debug output * sig=XXX - description(signature) of the key to use * * Returns the dentry object of the lower-level (lower/interposed) * directory; We want to mount our stackable file system on top of * that lower directory. * * The signature of the key to use must be the description of a key * already in the keyring. Mounting will fail if the key can not be * found. * * Returns zero on success; non-zero on error */ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, uid_t *check_ruid) { char *p; int rc = 0; int sig_set = 0; int cipher_name_set = 0; int fn_cipher_name_set = 0; int cipher_key_bytes; int cipher_key_bytes_set = 0; int fn_cipher_key_bytes; int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &sbi->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; int token; char *sig_src; char *cipher_name_dst; char *cipher_name_src; char *fn_cipher_name_dst; char *fn_cipher_name_src; char *fnek_dst; char *fnek_src; char *cipher_key_bytes_src; char *fn_cipher_key_bytes_src; u8 cipher_code; *check_ruid = 0; if (!options) { rc = -EINVAL; goto out; } ecryptfs_init_mount_crypt_stat(mount_crypt_stat); while ((p = strsep(&options, ",")) != NULL) { if (!*p) continue; token = match_token(p, tokens, args); switch (token) { case ecryptfs_opt_sig: case ecryptfs_opt_ecryptfs_sig: sig_src = args[0].from; rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, sig_src, 0); if (rc) { printk(KERN_ERR "Error attempting to register " "global sig; rc = [%d]\n", rc); goto out; } sig_set = 1; break; case ecryptfs_opt_cipher: case ecryptfs_opt_ecryptfs_cipher: cipher_name_src = args[0].from; cipher_name_dst = mount_crypt_stat-> global_default_cipher_name; strncpy(cipher_name_dst, cipher_name_src, ECRYPTFS_MAX_CIPHER_NAME_SIZE); cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; cipher_name_set = 1; break; case ecryptfs_opt_ecryptfs_key_bytes: cipher_key_bytes_src = args[0].from; cipher_key_bytes = (int)simple_strtol(cipher_key_bytes_src, &cipher_key_bytes_src, 0); mount_crypt_stat->global_default_cipher_key_size = cipher_key_bytes; cipher_key_bytes_set = 1; break; case ecryptfs_opt_passthrough: mount_crypt_stat->flags |= ECRYPTFS_PLAINTEXT_PASSTHROUGH_ENABLED; break; case ecryptfs_opt_xattr_metadata: mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; break; case ecryptfs_opt_encrypted_view: mount_crypt_stat->flags |= ECRYPTFS_XATTR_METADATA_ENABLED; mount_crypt_stat->flags |= ECRYPTFS_ENCRYPTED_VIEW_ENABLED; break; case ecryptfs_opt_fnek_sig: fnek_src = args[0].from; fnek_dst = mount_crypt_stat->global_default_fnek_sig; strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX); mount_crypt_stat->global_default_fnek_sig[ ECRYPTFS_SIG_SIZE_HEX] = '\0'; rc = ecryptfs_add_global_auth_tok( mount_crypt_stat, mount_crypt_stat->global_default_fnek_sig, ECRYPTFS_AUTH_TOK_FNEK); if (rc) { printk(KERN_ERR "Error attempting to register " "global fnek sig [%s]; rc = [%d]\n", mount_crypt_stat->global_default_fnek_sig, rc); goto out; } mount_crypt_stat->flags |= (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); break; case ecryptfs_opt_fn_cipher: fn_cipher_name_src = args[0].from; fn_cipher_name_dst = mount_crypt_stat->global_default_fn_cipher_name; strncpy(fn_cipher_name_dst, fn_cipher_name_src, ECRYPTFS_MAX_CIPHER_NAME_SIZE); mount_crypt_stat->global_default_fn_cipher_name[ ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; fn_cipher_name_set = 1; break; case ecryptfs_opt_fn_cipher_key_bytes: fn_cipher_key_bytes_src = args[0].from; fn_cipher_key_bytes = (int)simple_strtol(fn_cipher_key_bytes_src, &fn_cipher_key_bytes_src, 0); mount_crypt_stat->global_default_fn_cipher_key_bytes = fn_cipher_key_bytes; fn_cipher_key_bytes_set = 1; break; case ecryptfs_opt_unlink_sigs: mount_crypt_stat->flags |= ECRYPTFS_UNLINK_SIGS; break; case ecryptfs_opt_mount_auth_tok_only: mount_crypt_stat->flags |= ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY; break; case ecryptfs_opt_check_dev_ruid: *check_ruid = 1; break; case ecryptfs_opt_err: default: printk(KERN_WARNING "%s: eCryptfs: unrecognized option [%s]\n", __func__, p); } } if (!sig_set) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "You must supply at least one valid " "auth tok signature as a mount " "parameter; see the eCryptfs README\n"); goto out; } if (!cipher_name_set) { int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); BUG_ON(cipher_name_len > ECRYPTFS_MAX_CIPHER_NAME_SIZE); strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !fn_cipher_name_set) strcpy(mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_cipher_name); if (!cipher_key_bytes_set) mount_crypt_stat->global_default_cipher_key_size = 0; if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !fn_cipher_key_bytes_set) mount_crypt_stat->global_default_fn_cipher_key_bytes = mount_crypt_stat->global_default_cipher_key_size; cipher_code = ecryptfs_code_for_cipher_string( mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); if (!cipher_code) { ecryptfs_printk(KERN_ERR, "eCryptfs doesn't support cipher: %s", mount_crypt_stat->global_default_cipher_name); rc = -EINVAL; goto out; } mutex_lock(&key_tfm_list_mutex); if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); if (rc) { printk(KERN_ERR "Error attempting to initialize " "cipher with name = [%s] and key size = [%td]; " "rc = [%d]\n", mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size, rc); rc = -EINVAL; mutex_unlock(&key_tfm_list_mutex); goto out; } } if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) && !ecryptfs_tfm_exists( mount_crypt_stat->global_default_fn_cipher_name, NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_fn_cipher_key_bytes); if (rc) { printk(KERN_ERR "Error attempting to initialize " "cipher with name = [%s] and key size = [%td]; " "rc = [%d]\n", mount_crypt_stat->global_default_fn_cipher_name, mount_crypt_stat->global_default_fn_cipher_key_bytes, rc); rc = -EINVAL; mutex_unlock(&key_tfm_list_mutex); goto out; } } mutex_unlock(&key_tfm_list_mutex); rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); if (rc) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); out: return rc; } struct kmem_cache *ecryptfs_sb_info_cache; static struct file_system_type ecryptfs_fs_type; /** * ecryptfs_get_sb * @fs_type * @flags * @dev_name: The path to mount over * @raw_data: The options passed into the kernel */ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data) { struct super_block *s; struct ecryptfs_sb_info *sbi; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; struct path path; uid_t check_ruid; int rc; sbi = kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL); if (!sbi) { rc = -ENOMEM; goto out; } if (!dev_name) { rc = -EINVAL; err = "Device name cannot be null"; goto out; } rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; goto out; } mount_crypt_stat = &sbi->mount_crypt_stat; s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) { rc = PTR_ERR(s); goto out; } rc = super_setup_bdi(s); if (rc) goto out1; ecryptfs_set_superblock_private(s, sbi); /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; s->s_op = &ecryptfs_sops; s->s_xattr = ecryptfs_xattr_handlers; s->s_d_op = &ecryptfs_dops; err = "Reading sb failed"; rc = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (rc) { ecryptfs_printk(KERN_WARNING, "kern_path() failed\n"); goto out1; } if (path.dentry->d_sb->s_type == &ecryptfs_fs_type) { rc = -EINVAL; printk(KERN_ERR "Mount on filesystem of type " "eCryptfs explicitly disallowed due to " "known incompatibilities\n"); goto out_free; } if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " "requested user (uid: %d)\n", i_uid_read(d_inode(path.dentry)), from_kuid(&init_user_ns, current_uid())); goto out_free; } ecryptfs_set_superblock_lower(s, path.dentry->d_sb); /** * Set the POSIX ACL flag based on whether they're enabled in the lower * mount. */ s->s_flags = flags & ~MS_POSIXACL; s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; /** * Force a read-only eCryptfs mount when: * 1) The lower mount is ro * 2) The ecryptfs_encrypted_view mount option is specified */ if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) s->s_flags |= MS_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; rc = -EINVAL; if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); goto out_free; } inode = ecryptfs_get_inode(d_inode(path.dentry), s); rc = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free; s->s_root = d_make_root(inode); if (!s->s_root) { rc = -ENOMEM; goto out_free; } rc = -ENOMEM; root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); if (!root_info) goto out_free; /* ->kill_sb() will take care of root_info */ ecryptfs_set_dentry_private(s->s_root, root_info); root_info->lower_path = path; s->s_flags |= MS_ACTIVE; return dget(s->s_root); out_free: path_put(&path); out1: deactivate_locked_super(s); out: if (sbi) { ecryptfs_destroy_mount_crypt_stat(&sbi->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sbi); } printk(KERN_ERR "%s; rc = [%d]\n", err, rc); return ERR_PTR(rc); } /** * ecryptfs_kill_block_super * @sb: The ecryptfs super block * * Used to bring the superblock down and free the private data. */ static void ecryptfs_kill_block_super(struct super_block *sb) { struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb); kill_anon_super(sb); if (!sb_info) return; ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } static struct file_system_type ecryptfs_fs_type = { .owner = THIS_MODULE, .name = "ecryptfs", .mount = ecryptfs_mount, .kill_sb = ecryptfs_kill_block_super, .fs_flags = 0 }; MODULE_ALIAS_FS("ecryptfs"); /** * inode_info_init_once * * Initializes the ecryptfs_inode_info_cache when it is created */ static void inode_info_init_once(void *vptr) { struct ecryptfs_inode_info *ei = (struct ecryptfs_inode_info *)vptr; inode_init_once(&ei->vfs_inode); } static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; unsigned long flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { .cache = &ecryptfs_auth_tok_list_item_cache, .name = "ecryptfs_auth_tok_list_item", .size = sizeof(struct ecryptfs_auth_tok_list_item), }, { .cache = &ecryptfs_file_info_cache, .name = "ecryptfs_file_cache", .size = sizeof(struct ecryptfs_file_info), }, { .cache = &ecryptfs_dentry_info_cache, .name = "ecryptfs_dentry_info_cache", .size = sizeof(struct ecryptfs_dentry_info), }, { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { .cache = &ecryptfs_sb_info_cache, .name = "ecryptfs_sb_cache", .size = sizeof(struct ecryptfs_sb_info), }, { .cache = &ecryptfs_header_cache, .name = "ecryptfs_headers", .size = PAGE_SIZE, }, { .cache = &ecryptfs_xattr_cache, .name = "ecryptfs_xattr_cache", .size = PAGE_SIZE, }, { .cache = &ecryptfs_key_record_cache, .name = "ecryptfs_key_record_cache", .size = sizeof(struct ecryptfs_key_record), }, { .cache = &ecryptfs_key_sig_cache, .name = "ecryptfs_key_sig_cache", .size = sizeof(struct ecryptfs_key_sig), }, { .cache = &ecryptfs_global_auth_tok_cache, .name = "ecryptfs_global_auth_tok_cache", .size = sizeof(struct ecryptfs_global_auth_tok), }, { .cache = &ecryptfs_key_tfm_cache, .name = "ecryptfs_key_tfm_cache", .size = sizeof(struct ecryptfs_key_tfm), }, }; static void ecryptfs_free_kmem_caches(void) { int i; /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; kmem_cache_destroy(*(info->cache)); } } /** * ecryptfs_init_kmem_caches * * Returns zero on success; non-zero otherwise */ static int ecryptfs_init_kmem_caches(void) { int i; for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) { struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; *(info->cache) = kmem_cache_create(info->name, info->size, 0, SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " "kmem_cache_create failed\n", info->name); return -ENOMEM; } } return 0; } static struct kobject *ecryptfs_kobj; static ssize_t version_show(struct kobject *kobj, struct kobj_attribute *attr, char *buff) { return snprintf(buff, PAGE_SIZE, "%d\n", ECRYPTFS_VERSIONING_MASK); } static struct kobj_attribute version_attr = __ATTR_RO(version); static struct attribute *attributes[] = { &version_attr.attr, NULL, }; static struct attribute_group attr_group = { .attrs = attributes, }; static int do_sysfs_registration(void) { int rc; ecryptfs_kobj = kobject_create_and_add("ecryptfs", fs_kobj); if (!ecryptfs_kobj) { printk(KERN_ERR "Unable to create ecryptfs kset\n"); rc = -ENOMEM; goto out; } rc = sysfs_create_group(ecryptfs_kobj, &attr_group); if (rc) { printk(KERN_ERR "Unable to create ecryptfs version attributes\n"); kobject_put(ecryptfs_kobj); } out: return rc; } static void do_sysfs_unregistration(void) { sysfs_remove_group(ecryptfs_kobj, &attr_group); kobject_put(ecryptfs_kobj); } static int __init ecryptfs_init(void) { int rc; if (ECRYPTFS_DEFAULT_EXTENT_SIZE > PAGE_SIZE) { rc = -EINVAL; ecryptfs_printk(KERN_ERR, "The eCryptfs extent size is " "larger than the host's page size, and so " "eCryptfs cannot run on this system. The " "default eCryptfs extent size is [%u] bytes; " "the page size is [%lu] bytes.\n", ECRYPTFS_DEFAULT_EXTENT_SIZE, (unsigned long)PAGE_SIZE); goto out; } rc = ecryptfs_init_kmem_caches(); if (rc) { printk(KERN_ERR "Failed to allocate one or more kmem_cache objects\n"); goto out; } rc = do_sysfs_registration(); if (rc) { printk(KERN_ERR "sysfs registration failed\n"); goto out_free_kmem_caches; } rc = ecryptfs_init_kthread(); if (rc) { printk(KERN_ERR "%s: kthread initialization failed; " "rc = [%d]\n", __func__, rc); goto out_do_sysfs_unregistration; } rc = ecryptfs_init_messaging(); if (rc) { printk(KERN_ERR "Failure occurred while attempting to " "initialize the communications channel to " "ecryptfsd\n"); goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); if (rc) { printk(KERN_ERR "Failure whilst attempting to init crypto; " "rc = [%d]\n", rc); goto out_release_messaging; } rc = register_filesystem(&ecryptfs_fs_type); if (rc) { printk(KERN_ERR "Failed to register filesystem\n"); goto out_destroy_crypto; } if (ecryptfs_verbosity > 0) printk(KERN_CRIT "eCryptfs verbosity set to %d. Secret values " "will be written to the syslog!\n", ecryptfs_verbosity); goto out; out_destroy_crypto: ecryptfs_destroy_crypto(); out_release_messaging: ecryptfs_release_messaging(); out_destroy_kthread: ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: do_sysfs_unregistration(); out_free_kmem_caches: ecryptfs_free_kmem_caches(); out: return rc; } static void __exit ecryptfs_exit(void) { int rc; rc = ecryptfs_destroy_crypto(); if (rc) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); ecryptfs_release_messaging(); ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); ecryptfs_free_kmem_caches(); } MODULE_AUTHOR("Michael A. Halcrow <mhalcrow@us.ibm.com>"); MODULE_DESCRIPTION("eCryptfs"); MODULE_LICENSE("GPL"); module_init(ecryptfs_init) module_exit(ecryptfs_exit)
4 4 2 1 1 1 1 1 1 4 1 1 1 1 1 2 1 24 24 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 /* * net/sched/act_sample.c - Packet sampling tc action * Copyright (c) 2017 Yotam Gigi <yotamg@mellanox.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include <linux/types.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/module.h> #include <linux/init.h> #include <linux/gfp.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <linux/tc_act/tc_sample.h> #include <net/tc_act/tc_sample.h> #include <net/psample.h> #include <linux/if_arp.h> static unsigned int sample_net_id; static struct tc_action_ops act_sample_ops; static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { [TCA_SAMPLE_PARMS] = { .len = sizeof(struct tc_sample) }, [TCA_SAMPLE_RATE] = { .type = NLA_U32 }, [TCA_SAMPLE_TRUNC_SIZE] = { .type = NLA_U32 }, [TCA_SAMPLE_PSAMPLE_GROUP] = { .type = NLA_U32 }, }; static int tcf_sample_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) { struct tc_action_net *tn = net_generic(net, sample_net_id); struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; struct tc_sample *parm; struct tcf_sample *s; bool exists = false; u32 rate; int ret; if (!nla) return -EINVAL; ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy, NULL); if (ret < 0) return ret; if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] || !tb[TCA_SAMPLE_PSAMPLE_GROUP]) return -EINVAL; parm = nla_data(tb[TCA_SAMPLE_PARMS]); exists = tcf_idr_check(tn, parm->index, a, bind); if (exists && bind) return 0; if (!exists) { ret = tcf_idr_create(tn, parm->index, est, a, &act_sample_ops, bind, true); if (ret) return ret; ret = ACT_P_CREATED; } else { tcf_idr_release(*a, bind); if (!ovr) return -EEXIST; } rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); if (!rate) { tcf_idr_release(*a, bind); return -EINVAL; } s = to_sample(*a); s->tcf_action = parm->action; s->rate = nla_get_u32(tb[TCA_SAMPLE_RATE]); s->rate = rate; s->psample_group_num = nla_get_u32(tb[TCA_SAMPLE_PSAMPLE_GROUP]); psample_group = psample_group_get(net, s->psample_group_num); if (!psample_group) { if (ret == ACT_P_CREATED) tcf_idr_release(*a, bind); return -ENOMEM; } rcu_swap_protected(s->psample_group, psample_group, lockdep_is_held(&s->tcf_lock)); if (tb[TCA_SAMPLE_TRUNC_SIZE]) { s->truncate = true; s->trunc_size = nla_get_u32(tb[TCA_SAMPLE_TRUNC_SIZE]); } if (psample_group) psample_group_put(psample_group); if (ret == ACT_P_CREATED) tcf_idr_insert(tn, *a); return ret; } static void tcf_sample_cleanup_rcu(struct rcu_head *rcu) { struct tcf_sample *s = container_of(rcu, struct tcf_sample, rcu); struct psample_group *psample_group; psample_group = rcu_dereference_protected(s->psample_group, 1); RCU_INIT_POINTER(s->psample_group, NULL); if (psample_group) psample_group_put(psample_group); } static void tcf_sample_cleanup(struct tc_action *a, int bind) { struct tcf_sample *s = to_sample(a); call_rcu(&s->rcu, tcf_sample_cleanup_rcu); } static bool tcf_sample_dev_ok_push(struct net_device *dev) { switch (dev->type) { case ARPHRD_TUNNEL: case ARPHRD_TUNNEL6: case ARPHRD_SIT: case ARPHRD_IPGRE: case ARPHRD_IP6GRE: case ARPHRD_VOID: case ARPHRD_NONE: return false; default: return true; } } static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_sample *s = to_sample(a); struct psample_group *psample_group; int retval; int size; int iif; int oif; tcf_lastuse_update(&s->tcf_tm); bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); retval = READ_ONCE(s->tcf_action); rcu_read_lock(); psample_group = rcu_dereference(s->psample_group); /* randomly sample packets according to rate */ if (psample_group && (prandom_u32() % s->rate == 0)) { if (!skb_at_tc_ingress(skb)) { iif = skb->skb_iif; oif = skb->dev->ifindex; } else { iif = skb->dev->ifindex; oif = 0; } /* on ingress, the mac header gets popped, so push it back */ if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_push(skb, skb->mac_len); size = s->truncate ? s->trunc_size : skb->len; psample_sample_packet(psample_group, skb, size, iif, oif, s->rate); if (skb_at_tc_ingress(skb) && tcf_sample_dev_ok_push(skb->dev)) skb_pull(skb, skb->mac_len); } rcu_read_unlock(); return retval; } static int tcf_sample_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_sample *s = to_sample(a); struct tc_sample opt = { .index = s->tcf_index, .action = s->tcf_action, .refcnt = s->tcf_refcnt - ref, .bindcnt = s->tcf_bindcnt - bind, }; struct tcf_t t; if (nla_put(skb, TCA_SAMPLE_PARMS, sizeof(opt), &opt)) goto nla_put_failure; tcf_tm_dump(&t, &s->tcf_tm); if (nla_put_64bit(skb, TCA_SAMPLE_TM, sizeof(t), &t, TCA_SAMPLE_PAD)) goto nla_put_failure; if (nla_put_u32(skb, TCA_SAMPLE_RATE, s->rate)) goto nla_put_failure; if (s->truncate) if (nla_put_u32(skb, TCA_SAMPLE_TRUNC_SIZE, s->trunc_size)) goto nla_put_failure; if (nla_put_u32(skb, TCA_SAMPLE_PSAMPLE_GROUP, s->psample_group_num)) goto nla_put_failure; return skb->len; nla_put_failure: nlmsg_trim(skb, b); return -1; } static int tcf_sample_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops) { struct tc_action_net *tn = net_generic(net, sample_net_id); return tcf_generic_walker(tn, skb, cb, type, ops); } static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, sample_net_id); return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_sample_ops = { .kind = "sample", .type = TCA_ACT_SAMPLE, .owner = THIS_MODULE, .act = tcf_sample_act, .dump = tcf_sample_dump, .init = tcf_sample_init, .cleanup = tcf_sample_cleanup, .walk = tcf_sample_walker, .lookup = tcf_sample_search, .size = sizeof(struct tcf_sample), }; static __net_init int sample_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, sample_net_id); return tc_action_net_init(net, tn, &act_sample_ops); } static void __net_exit sample_exit_net(struct net *net) { struct tc_action_net *tn = net_generic(net, sample_net_id); tc_action_net_exit(tn); } static struct pernet_operations sample_net_ops = { .init = sample_init_net, .exit = sample_exit_net, .id = &sample_net_id, .size = sizeof(struct tc_action_net), }; static int __init sample_init_module(void) { return tcf_register_action(&act_sample_ops, &sample_net_ops); } static void __exit sample_cleanup_module(void) { rcu_barrier(); tcf_unregister_action(&act_sample_ops, &sample_net_ops); } module_init(sample_init_module); module_exit(sample_cleanup_module); MODULE_AUTHOR("Yotam Gigi <yotam.gi@gmail.com>"); MODULE_DESCRIPTION("Packet sampling action"); MODULE_LICENSE("GPL v2");
35 33 31 29 26 26 26 26 25 26 9 2 2 2 2 25 1 24 16 2 6 6 2 6 9 17 13 8 8 8 8 5 5 8 1 8 1 8 3 3 2 4 3 3 2 2 1 1 2 3 2 1 2 2 5 39 35 31 7 7 6 6 6 1 5 6 6 5 6 29 29 19 10 29 29 29 18 12 27 2 1 2 27 16 27 28 27 28 18 28 17 1 28 25 8 26 12 26 25 28 30 7 23 29 3 28 24 3 3 2 1 1 22 22 4 4 18 22 17 2 1 2 1 2 20 20 2 2 2 18 6 6 1 6 18 5 20 4 4 16 16 20 20 2 2 2 99 97 93 94 93 47 46 45 40 6 6 3 2 6 88 99 3 3 3 2 136 14 14 1825 32 13 12 1 12 2 1 1 1 1 12 2 12 3 12 3 26 22 7 6 5 15 20 20 20 23 26 5 2 4 4 4 5 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the name of Volkswagen nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * Alternatively, provided that this notice is retained in full, this * software may be distributed under the terms of the GNU General * Public License ("GPL") version 2, in which case the provisions of the * GPL apply INSTEAD OF those given above. * * The provided data structures and external interfaces from this code * are not restricted to be used by modules with a GPL compatible license. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. * */ #include <linux/module.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/hrtimer.h> #include <linux/list.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/uio.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/socket.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/bcm.h> #include <linux/slab.h> #include <net/sock.h> #include <net/net_namespace.h> /* * To send multiple CAN frame content within TX_SETUP or to filter * CAN messages with multiplex index within RX_SETUP, the number of * different filters is limited to 256 due to the one byte index value. */ #define MAX_NFRAMES 256 /* limit timers to 400 days for sending/timeouts */ #define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60) /* use of last_frames[index].flags */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ #define BCM_CAN_FLAGS_MASK 0x3F /* to clean private flags after usage */ /* get best masking value for can_rx_register() for a given single can_id */ #define REGMASK(id) ((id & CAN_EFF_FLAG) ? \ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) #define CAN_BCM_VERSION "20170425" MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Oliver Hartkopp <oliver.hartkopp@volkswagen.de>"); MODULE_ALIAS("can-proto-2"); /* * easy access to the first 64 bit of can(fd)_frame payload. cp->data is * 64 bit aligned so the offset has to be multiples of 8 which is ensured * by the only callers in bcm_rx_cmp_to_index() bcm_rx_handler(). */ static inline u64 get_u64(const struct canfd_frame *cp, int offset) { return *(u64 *)(cp->data + offset); } struct bcm_op { struct list_head list; int ifindex; canid_t can_id; u32 flags; unsigned long frames_abs, frames_filtered; struct bcm_timeval ival1, ival2; struct hrtimer timer, thrtimer; struct tasklet_struct tsklet, thrtsklet; ktime_t rx_stamp, kt_ival1, kt_ival2, kt_lastmsg; int rx_ifindex; int cfsiz; u32 count; u32 nframes; u32 currframe; /* void pointers to arrays of struct can[fd]_frame */ void *frames; void *last_frames; struct canfd_frame sframe; struct canfd_frame last_sframe; struct sock *sk; struct net_device *rx_reg_dev; }; struct bcm_sock { struct sock sk; int bound; int ifindex; struct list_head notifier; struct list_head rx_ops; struct list_head tx_ops; unsigned long dropped_usr_msgs; struct proc_dir_entry *bcm_proc_read; char procname [32]; /* inode number in decimal with \0 */ }; static LIST_HEAD(bcm_notifier_list); static DEFINE_SPINLOCK(bcm_notifier_lock); static struct bcm_sock *bcm_busy_notifier; static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; } static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) { return ktime_set(tv.tv_sec, tv.tv_usec * NSEC_PER_USEC); } /* check limitations for timeval provided by user */ static bool bcm_is_invalid_tv(struct bcm_msg_head *msg_head) { if ((msg_head->ival1.tv_sec < 0) || (msg_head->ival1.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival1.tv_usec < 0) || (msg_head->ival1.tv_usec >= USEC_PER_SEC) || (msg_head->ival2.tv_sec < 0) || (msg_head->ival2.tv_sec > BCM_TIMER_SEC_MAX) || (msg_head->ival2.tv_usec < 0) || (msg_head->ival2.tv_usec >= USEC_PER_SEC)) return true; return false; } #define CFSIZ(flags) ((flags & CAN_FD_FRAME) ? CANFD_MTU : CAN_MTU) #define OPSIZ sizeof(struct bcm_op) #define MHSIZ sizeof(struct bcm_msg_head) /* * procfs functions */ #if IS_ENABLED(CONFIG_PROC_FS) static char *bcm_proc_getifname(struct net *net, char *result, int ifindex) { struct net_device *dev; if (!ifindex) return "any"; rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) strcpy(result, dev->name); else strcpy(result, "???"); rcu_read_unlock(); return result; } static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; struct net *net = m->private; struct sock *sk = (struct sock *)PDE_DATA(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; seq_printf(m, ">>> socket %pK", sk->sk_socket); seq_printf(m, " / sk %pK", sk); seq_printf(m, " / bo %pK", bo); seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs); seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex)); seq_printf(m, " <<<\n"); list_for_each_entry(op, &bo->rx_ops, list) { unsigned long reduction; /* print only active entries & prevent division by zero */ if (!op->frames_abs) continue; seq_printf(m, "rx_op: %03X %-5s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u)", op->nframes); else seq_printf(m, "[%u]", op->nframes); seq_printf(m, "%c ", (op->flags & RX_CHECK_DLC) ? 'd' : ' '); if (op->kt_ival1) seq_printf(m, "timeo=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "thr=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# recv %ld (%ld) => reduction: ", op->frames_filtered, op->frames_abs); reduction = 100 - (op->frames_filtered * 100) / op->frames_abs; seq_printf(m, "%s%ld%%\n", (reduction == 100) ? "near " : "", reduction); } list_for_each_entry(op, &bo->tx_ops, list) { seq_printf(m, "tx_op: %03X %s ", op->can_id, bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u) ", op->nframes); else seq_printf(m, "[%u] ", op->nframes); if (op->kt_ival1) seq_printf(m, "t1=%lld ", (long long)ktime_to_us(op->kt_ival1)); if (op->kt_ival2) seq_printf(m, "t2=%lld ", (long long)ktime_to_us(op->kt_ival2)); seq_printf(m, "# sent %ld\n", op->frames_abs); } seq_putc(m, '\n'); return 0; } static int bcm_proc_open(struct inode *inode, struct file *file) { return single_open_net(inode, file, bcm_proc_show); } static const struct file_operations bcm_proc_fops = { .owner = THIS_MODULE, .open = bcm_proc_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; #endif /* CONFIG_PROC_FS */ /* * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface * of the given bcm tx op */ static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; int err; /* no target device? => exit */ if (!op->ifindex) return; dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (!dev) { /* RFC: should this bcm_op remove itself here? */ return; } skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any()); if (!skb) goto out; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_put_data(skb, cf, op->cfsiz); /* send with loopback */ skb->dev = dev; can_skb_set_owner(skb, op->sk); err = can_send(skb, 1); if (!err) op->frames_abs++; op->currframe++; /* reached last frame? */ if (op->currframe >= op->nframes) op->currframe = 0; out: dev_put(dev); } /* * bcm_send_to_user - send a BCM message to the userspace * (consisting of bcm_msg_head + x CAN frames) */ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, struct canfd_frame *frames, int has_timestamp) { struct sk_buff *skb; struct canfd_frame *firstframe; struct sockaddr_can *addr; struct sock *sk = op->sk; unsigned int datalen = head->nframes * op->cfsiz; int err; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); if (!skb) return; skb_put_data(skb, head, sizeof(*head)); if (head->nframes) { /* CAN frames starting here */ firstframe = (struct canfd_frame *)skb_tail_pointer(skb); skb_put_data(skb, frames, datalen); /* * the BCM uses the flags-element of the canfd_frame * structure for internal purposes. This is only * relevant for updates that are generated by the * BCM, where nframes is 1 */ if (head->nframes == 1) firstframe->flags &= BCM_CAN_FLAGS_MASK; } if (has_timestamp) { /* restore rx timestamp */ skb->tstamp = op->rx_stamp; } /* * Put the datagram to the queue so that bcm_recvmsg() can * get it from there. We need to pass the interface index to * bcm_recvmsg(). We pass a whole struct sockaddr_can in skb->cb * containing the interface index. */ sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; addr->can_ifindex = op->rx_ifindex; err = sock_queue_rcv_skb(sk, skb); if (err < 0) { struct bcm_sock *bo = bcm_sk(sk); kfree_skb(skb); /* don't care about overflows in this statistic */ bo->dropped_usr_msgs++; } } static void bcm_tx_start_timer(struct bcm_op *op) { if (op->kt_ival1 && op->count) hrtimer_start(&op->timer, ktime_add(ktime_get(), op->kt_ival1), HRTIMER_MODE_ABS); else if (op->kt_ival2) hrtimer_start(&op->timer, ktime_add(ktime_get(), op->kt_ival2), HRTIMER_MODE_ABS); } static void bcm_tx_timeout_tsklet(unsigned long data) { struct bcm_op *op = (struct bcm_op *)data; struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { op->count--; if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = TX_EXPIRED; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); } bcm_can_tx(op); } else if (op->kt_ival2) bcm_can_tx(op); bcm_tx_start_timer(op); } /* * bcm_tx_timeout_handler - performs cyclic CAN frame transmissions */ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); tasklet_schedule(&op->tsklet); return HRTIMER_NORESTART; } /* * bcm_rx_changed - create a RX_CHANGED notification due to changed content */ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) { struct bcm_msg_head head; /* update statistics */ op->frames_filtered++; /* prevent statistics overflow */ if (op->frames_filtered > ULONG_MAX/100) op->frames_filtered = op->frames_abs = 0; /* this element is not throttled anymore */ data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV); memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; head.flags = op->flags; head.count = op->count; head.ival1 = op->ival1; head.ival2 = op->ival2; head.can_id = op->can_id; head.nframes = 1; bcm_send_to_user(op, &head, data, 1); } /* * bcm_rx_update_and_send - process a detected relevant receive content change * 1. update the last received data * 2. send a notification to the user (if possible) */ static void bcm_rx_update_and_send(struct bcm_op *op, struct canfd_frame *lastdata, const struct canfd_frame *rxdata) { memcpy(lastdata, rxdata, op->cfsiz); /* mark as used and throttled by default */ lastdata->flags |= (RX_RECV|RX_THR); /* throttling mode inactive ? */ if (!op->kt_ival2) { /* send RX_CHANGED to the user immediately */ bcm_rx_changed(op, lastdata); return; } /* with active throttling timer we are just done here */ if (hrtimer_active(&op->thrtimer)) return; /* first reception with enabled throttling mode */ if (!op->kt_lastmsg) goto rx_changed_settime; /* got a second frame inside a potential throttle period? */ if (ktime_us_delta(ktime_get(), op->kt_lastmsg) < ktime_to_us(op->kt_ival2)) { /* do not send the saved data - only start throttle timer */ hrtimer_start(&op->thrtimer, ktime_add(op->kt_lastmsg, op->kt_ival2), HRTIMER_MODE_ABS); return; } /* the gap was that big, that throttling was not needed here */ rx_changed_settime: bcm_rx_changed(op, lastdata); op->kt_lastmsg = ktime_get(); } /* * bcm_rx_cmp_to_index - (bit)compares the currently received data to formerly * received data stored in op->last_frames[] */ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, const struct canfd_frame *rxdata) { struct canfd_frame *cf = op->frames + op->cfsiz * index; struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; int i; /* * no one uses the MSBs of flags for comparison, * so we use it here to detect the first time of reception */ if (!(lcf->flags & RX_RECV)) { /* received data for the first time => send update to user */ bcm_rx_update_and_send(op, lcf, rxdata); return; } /* do a real check in CAN frame data section */ for (i = 0; i < rxdata->len; i += 8) { if ((get_u64(cf, i) & get_u64(rxdata, i)) != (get_u64(cf, i) & get_u64(lcf, i))) { bcm_rx_update_and_send(op, lcf, rxdata); return; } } if (op->flags & RX_CHECK_DLC) { /* do a real check in CAN frame length */ if (rxdata->len != lcf->len) { bcm_rx_update_and_send(op, lcf, rxdata); return; } } } /* * bcm_rx_starttimer - enable timeout monitoring for CAN frame reception */ static void bcm_rx_starttimer(struct bcm_op *op) { if (op->flags & RX_NO_AUTOTIMER) return; if (op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL); } static void bcm_rx_timeout_tsklet(unsigned long data) { struct bcm_op *op = (struct bcm_op *)data; struct bcm_msg_head msg_head; /* create notification to user */ memset(&msg_head, 0, sizeof(msg_head)); msg_head.opcode = RX_TIMEOUT; msg_head.flags = op->flags; msg_head.count = op->count; msg_head.ival1 = op->ival1; msg_head.ival2 = op->ival2; msg_head.can_id = op->can_id; msg_head.nframes = 0; bcm_send_to_user(op, &msg_head, NULL, 0); } /* * bcm_rx_timeout_handler - when the (cyclic) CAN frame reception timed out */ static enum hrtimer_restart bcm_rx_timeout_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, timer); /* schedule before NET_RX_SOFTIRQ */ tasklet_hi_schedule(&op->tsklet); /* no restart of the timer is done here! */ /* if user wants to be informed, when cyclic CAN-Messages come back */ if ((op->flags & RX_ANNOUNCE_RESUME) && op->last_frames) { /* clear received CAN frames to indicate 'nothing received' */ memset(op->last_frames, 0, op->nframes * op->cfsiz); } return HRTIMER_NORESTART; } /* * bcm_rx_do_flush - helper for bcm_rx_thr_flush */ static inline int bcm_rx_do_flush(struct bcm_op *op, int update, unsigned int index) { struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; if ((op->last_frames) && (lcf->flags & RX_THR)) { if (update) bcm_rx_changed(op, lcf); return 1; } return 0; } /* * bcm_rx_thr_flush - Check for throttled data and send it to the userspace * * update == 0 : just check if throttled data is available (any irq context) * update == 1 : check and send throttled data to userspace (soft_irq context) */ static int bcm_rx_thr_flush(struct bcm_op *op, int update) { int updated = 0; if (op->nframes > 1) { unsigned int i; /* for MUX filter we start at index 1 */ for (i = 1; i < op->nframes; i++) updated += bcm_rx_do_flush(op, update, i); } else { /* for RX_FILTER_ID and simple filter */ updated += bcm_rx_do_flush(op, update, 0); } return updated; } static void bcm_rx_thr_tsklet(unsigned long data) { struct bcm_op *op = (struct bcm_op *)data; /* push the changed data to the userspace */ bcm_rx_thr_flush(op, 1); } /* * bcm_rx_thr_handler - the time for blocked content updates is over now: * Check for throttled data and send it to the userspace */ static enum hrtimer_restart bcm_rx_thr_handler(struct hrtimer *hrtimer) { struct bcm_op *op = container_of(hrtimer, struct bcm_op, thrtimer); tasklet_schedule(&op->thrtsklet); if (bcm_rx_thr_flush(op, 0)) { hrtimer_forward(hrtimer, ktime_get(), op->kt_ival2); return HRTIMER_RESTART; } else { /* rearm throttle handling */ op->kt_lastmsg = 0; return HRTIMER_NORESTART; } } /* * bcm_rx_handler - handle a CAN frame reception */ static void bcm_rx_handler(struct sk_buff *skb, void *data) { struct bcm_op *op = (struct bcm_op *)data; const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data; unsigned int i; if (op->can_id != rxframe->can_id) return; /* make sure to handle the correct frame type (CAN / CAN FD) */ if (skb->len != op->cfsiz) return; /* disable timeout */ hrtimer_cancel(&op->timer); /* save rx timestamp */ op->rx_stamp = skb->tstamp; /* save originator for recvfrom() */ op->rx_ifindex = skb->dev->ifindex; /* update statistics */ op->frames_abs++; if (op->flags & RX_RTR_FRAME) { /* send reply for RTR-request (placed in op->frames[0]) */ bcm_can_tx(op); return; } if (op->flags & RX_FILTER_ID) { /* the easiest case */ bcm_rx_update_and_send(op, op->last_frames, rxframe); goto rx_starttimer; } if (op->nframes == 1) { /* simple compare with index 0 */ bcm_rx_cmp_to_index(op, 0, rxframe); goto rx_starttimer; } if (op->nframes > 1) { /* * multiplex compare * * find the first multiplex mask that fits. * Remark: The MUX-mask is stored in index 0 - but only the * first 64 bits of the frame data[] are relevant (CAN FD) */ for (i = 1; i < op->nframes; i++) { if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) == (get_u64(op->frames, 0) & get_u64(op->frames + op->cfsiz * i, 0))) { bcm_rx_cmp_to_index(op, i, rxframe); break; } } } rx_starttimer: bcm_rx_starttimer(op); } /* * helpers for bcm_op handling: find & delete bcm [rx|tx] op elements */ static struct bcm_op *bcm_find_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op; list_for_each_entry(op, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) return op; } return NULL; } static void bcm_remove_op(struct bcm_op *op) { if (op->tsklet.func) { do { tasklet_kill(&op->tsklet); hrtimer_cancel(&op->timer); } while (test_bit(TASKLET_STATE_SCHED, &op->tsklet.state) || test_bit(TASKLET_STATE_RUN, &op->tsklet.state) || hrtimer_active(&op->timer)); } if (op->thrtsklet.func) { do { tasklet_kill(&op->thrtsklet); hrtimer_cancel(&op->thrtimer); } while (test_bit(TASKLET_STATE_SCHED, &op->thrtsklet.state) || test_bit(TASKLET_STATE_RUN, &op->thrtsklet.state) || hrtimer_active(&op->thrtimer)); } if ((op->frames) && (op->frames != &op->sframe)) kfree(op->frames); if ((op->last_frames) && (op->last_frames != &op->last_sframe)) kfree(op->last_frames); kfree(op); } static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) { if (op->rx_reg_dev == dev) { can_rx_unregister(dev_net(dev), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); /* mark as removed subscription */ op->rx_reg_dev = NULL; } else printk(KERN_ERR "can-bcm: bcm_rx_unreg: registered device " "mismatch %p %p\n", op->rx_reg_dev, dev); } /* * bcm_delete_rx_op - find and remove a rx op (returns number of removed ops) */ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { /* * Don't care if we're bound or not (due to netdev * problems) can_rx_unregister() is always a save * thing to do here. */ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(sock_net(op->sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); list_del(&op->list); synchronize_rcu(); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_delete_tx_op - find and remove a tx op (returns number of removed ops) */ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, int ifindex) { struct bcm_op *op, *n; list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { list_del(&op->list); bcm_remove_op(op); return 1; /* done */ } } return 0; /* not found */ } /* * bcm_read_op - read out a bcm_op and send it to the user (for bcm_sendmsg) */ static int bcm_read_op(struct list_head *ops, struct bcm_msg_head *msg_head, int ifindex) { struct bcm_op *op = bcm_find_op(ops, msg_head, ifindex); if (!op) return -EINVAL; /* put current values into msg_head */ msg_head->flags = op->flags; msg_head->count = op->count; msg_head->ival1 = op->ival1; msg_head->ival2 = op->ival2; msg_head->nframes = op->nframes; bcm_send_to_user(op, msg_head, op->frames, 0); return MHSIZ; } /* * bcm_tx_setup - create or update a bcm tx op (for bcm_sendmsg) */ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; struct canfd_frame *cf; unsigned int i; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; /* check nframes boundaries - we need at least one CAN frame */ if (msg_head->nframes < 1 || msg_head->nframes > MAX_NFRAMES) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->tx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; /* update CAN frames content */ for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) return err; if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } op->flags = msg_head->flags; } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; op->can_id = msg_head->can_id; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { op->frames = kmalloc(msg_head->nframes * op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } } else op->frames = &op->sframe; for (i = 0; i < msg_head->nframes; i++) { cf = op->frames + op->cfsiz * i; err = memcpy_from_msg((u8 *)cf, msg, op->cfsiz); if (op->flags & CAN_FD_FRAME) { if (cf->len > 64) err = -EINVAL; } else { if (cf->len > 8) err = -EINVAL; } if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); kfree(op); return err; } if (msg_head->flags & TX_CP_CAN_ID) { /* copy can_id into frame */ cf->can_id = msg_head->can_id; } } /* tx_ops never compare with previous received messages */ op->last_frames = NULL; /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); op->timer.function = bcm_tx_timeout_handler; /* initialize tasklet for tx countevent notification */ tasklet_init(&op->tsklet, bcm_tx_timeout_tsklet, (unsigned long) op); /* currently unused in tx_ops */ hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); /* add this bcm_op to the list of the tx_ops */ list_add(&op->list, &bo->tx_ops); } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */ if (op->nframes != msg_head->nframes) { op->nframes = msg_head->nframes; /* start multiple frame transmission with index 0 */ op->currframe = 0; } /* check flags */ if (op->flags & TX_RESET_MULTI_IDX) { /* start multiple frame transmission with index 0 */ op->currframe = 0; } if (op->flags & SETTIMER) { /* set timer values */ op->count = msg_head->count; op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero values? */ if (!op->kt_ival1 && !op->kt_ival2) hrtimer_cancel(&op->timer); } if (op->flags & STARTTIMER) { hrtimer_cancel(&op->timer); /* spec: send CAN frame when starting timer */ op->flags |= TX_ANNOUNCE; } if (op->flags & TX_ANNOUNCE) { bcm_can_tx(op); if (op->count) op->count--; } if (op->flags & STARTTIMER) bcm_tx_start_timer(op); return msg_head->nframes * op->cfsiz + MHSIZ; } /* * bcm_rx_setup - create or update a bcm rx op (for bcm_sendmsg) */ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, int ifindex, struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; int do_rx_register; int err = 0; if ((msg_head->flags & RX_FILTER_ID) || (!(msg_head->nframes))) { /* be robust against wrong usage ... */ msg_head->flags |= RX_FILTER_ID; /* ignore trailing garbage */ msg_head->nframes = 0; } /* the first element contains the mux-mask => MAX_NFRAMES + 1 */ if (msg_head->nframes > MAX_NFRAMES + 1) return -EINVAL; if ((msg_head->flags & RX_RTR_FRAME) && ((msg_head->nframes != 1) || (!(msg_head->can_id & CAN_RTR_FLAG)))) return -EINVAL; /* check timeval limitations */ if ((msg_head->flags & SETTIMER) && bcm_is_invalid_tv(msg_head)) return -EINVAL; /* check the given can_id */ op = bcm_find_op(&bo->rx_ops, msg_head, ifindex); if (op) { /* update existing BCM operation */ /* * Do we need more space for the CAN frames than currently * allocated? -> This is a _really_ unusual use-case and * therefore (complexity / locking) it is not supported. */ if (msg_head->nframes > op->nframes) return -E2BIG; if (msg_head->nframes) { /* update CAN frames content */ err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) return err; /* clear last_frames to indicate 'nothing received' */ memset(op->last_frames, 0, msg_head->nframes * op->cfsiz); } op->nframes = msg_head->nframes; op->flags = msg_head->flags; /* Only an update -> do not call can_rx_register() */ do_rx_register = 0; } else { /* insert new BCM operation for the given can_id */ op = kzalloc(OPSIZ, GFP_KERNEL); if (!op) return -ENOMEM; op->can_id = msg_head->can_id; op->nframes = msg_head->nframes; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; if (msg_head->nframes > 1) { /* create array for CAN frames and copy the data */ op->frames = kmalloc(msg_head->nframes * op->cfsiz, GFP_KERNEL); if (!op->frames) { kfree(op); return -ENOMEM; } /* create and init array for received CAN frames */ op->last_frames = kzalloc(msg_head->nframes * op->cfsiz, GFP_KERNEL); if (!op->last_frames) { kfree(op->frames); kfree(op); return -ENOMEM; } } else { op->frames = &op->sframe; op->last_frames = &op->last_sframe; } if (msg_head->nframes) { err = memcpy_from_msg(op->frames, msg, msg_head->nframes * op->cfsiz); if (err < 0) { if (op->frames != &op->sframe) kfree(op->frames); if (op->last_frames != &op->last_sframe) kfree(op->last_frames); kfree(op); return err; } } /* bcm_can_tx / bcm_tx_timeout_handler needs this */ op->sk = sk; op->ifindex = ifindex; /* ifindex for timeout events w/o previous frame reception */ op->rx_ifindex = ifindex; /* initialize uninitialized (kzalloc) structure */ hrtimer_init(&op->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); op->timer.function = bcm_rx_timeout_handler; /* initialize tasklet for rx timeout notification */ tasklet_init(&op->tsklet, bcm_rx_timeout_tsklet, (unsigned long) op); hrtimer_init(&op->thrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); op->thrtimer.function = bcm_rx_thr_handler; /* initialize tasklet for rx throttle handling */ tasklet_init(&op->thrtsklet, bcm_rx_thr_tsklet, (unsigned long) op); /* add this bcm_op to the list of the rx_ops */ list_add(&op->list, &bo->rx_ops); /* call can_rx_register() */ do_rx_register = 1; } /* if ((op = bcm_find_op(&bo->rx_ops, msg_head->can_id, ifindex))) */ /* check flags */ if (op->flags & RX_RTR_FRAME) { struct canfd_frame *frame0 = op->frames; /* no timers in RTR-mode */ hrtimer_cancel(&op->thrtimer); hrtimer_cancel(&op->timer); /* * funny feature in RX(!)_SETUP only for RTR-mode: * copy can_id into frame BUT without RTR-flag to * prevent a full-load-loopback-test ... ;-] */ if ((op->flags & TX_CP_CAN_ID) || (frame0->can_id == op->can_id)) frame0->can_id = op->can_id & ~CAN_RTR_FLAG; } else { if (op->flags & SETTIMER) { /* set timer value */ op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); op->kt_ival2 = bcm_timeval_to_ktime(msg_head->ival2); /* disable an active timer due to zero value? */ if (!op->kt_ival1) hrtimer_cancel(&op->timer); /* * In any case cancel the throttle timer, flush * potentially blocked msgs and reset throttle handling */ op->kt_lastmsg = 0; hrtimer_cancel(&op->thrtimer); bcm_rx_thr_flush(op, 1); } if ((op->flags & STARTTIMER) && op->kt_ival1) hrtimer_start(&op->timer, op->kt_ival1, HRTIMER_MODE_REL); } /* now we can register for can_ids, if we added a new bcm_op */ if (do_rx_register) { if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (dev) { err = can_rx_register(sock_net(sk), dev, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); op->rx_reg_dev = dev; dev_put(dev); } } else err = can_rx_register(sock_net(sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); if (err) { /* this bcm rx op is broken -> remove it */ list_del(&op->list); bcm_remove_op(op); return err; } } return msg_head->nframes * op->cfsiz + MHSIZ; } /* * bcm_tx_send - send a single CAN frame to the CAN interface (for bcm_sendmsg) */ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, int cfsiz) { struct sk_buff *skb; struct net_device *dev; int err; /* we need a real device to send frames */ if (!ifindex) return -ENODEV; skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL); if (!skb) return -ENOMEM; can_skb_reserve(skb); err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz); if (err < 0) { kfree_skb(skb); return err; } dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) { kfree_skb(skb); return -ENODEV; } can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb->dev = dev; can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ dev_put(dev); if (err) return err; return cfsiz + MHSIZ; } /* * bcm_sendmsg - process BCM commands (opcodes) from the userspace */ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); int ifindex = bo->ifindex; /* default ifindex for this bcm_op */ struct bcm_msg_head msg_head; int cfsiz; int ret; /* read bytes or error codes as return value */ if (!bo->bound) return -ENOTCONN; /* check for valid message length from userspace */ if (size < MHSIZ) return -EINVAL; /* read message head information */ ret = memcpy_from_msg((u8 *)&msg_head, msg, MHSIZ); if (ret < 0) return ret; cfsiz = CFSIZ(msg_head.flags); if ((size - MHSIZ) % cfsiz) return -EINVAL; /* check for alternative ifindex for this bcm_op */ if (!ifindex && msg->msg_name) { /* no bound device as default => check msg_name */ DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); if (msg->msg_namelen < sizeof(*addr)) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; /* ifindex from sendto() */ ifindex = addr->can_ifindex; if (ifindex) { struct net_device *dev; dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENODEV; if (dev->type != ARPHRD_CAN) { dev_put(dev); return -ENODEV; } dev_put(dev); } } lock_sock(sk); switch (msg_head.opcode) { case TX_SETUP: ret = bcm_tx_setup(&msg_head, msg, ifindex, sk); break; case RX_SETUP: ret = bcm_rx_setup(&msg_head, msg, ifindex, sk); break; case TX_DELETE: if (bcm_delete_tx_op(&bo->tx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case RX_DELETE: if (bcm_delete_rx_op(&bo->rx_ops, &msg_head, ifindex)) ret = MHSIZ; else ret = -EINVAL; break; case TX_READ: /* reuse msg_head for the reply to TX_READ */ msg_head.opcode = TX_STATUS; ret = bcm_read_op(&bo->tx_ops, &msg_head, ifindex); break; case RX_READ: /* reuse msg_head for the reply to RX_READ */ msg_head.opcode = RX_STATUS; ret = bcm_read_op(&bo->rx_ops, &msg_head, ifindex); break; case TX_SEND: /* we need exactly one CAN frame behind the msg head */ if ((msg_head.nframes != 1) || (size != cfsiz + MHSIZ)) ret = -EINVAL; else ret = bcm_tx_send(msg, ifindex, sk, cfsiz); break; default: ret = -EINVAL; break; } release_sock(sk); return ret; } /* * notification handler for netdevice status changes */ static void bcm_notify(struct bcm_sock *bo, unsigned long msg, struct net_device *dev) { struct sock *sk = &bo->sk; struct bcm_op *op; int notify_enodev = 0; if (!net_eq(dev_net(dev), sock_net(sk))) return; switch (msg) { case NETDEV_UNREGISTER: lock_sock(sk); /* remove device specific receive entries */ list_for_each_entry(op, &bo->rx_ops, list) if (op->rx_reg_dev == dev) bcm_rx_unreg(dev, op); /* remove device reference, if this is our bound device */ if (bo->bound && bo->ifindex == dev->ifindex) { bo->bound = 0; bo->ifindex = 0; notify_enodev = 1; } release_sock(sk); if (notify_enodev) { sk->sk_err = ENODEV; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); } break; case NETDEV_DOWN: if (bo->bound && bo->ifindex == dev->ifindex) { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); } } } static int bcm_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; if (msg != NETDEV_UNREGISTER && msg != NETDEV_DOWN) return NOTIFY_DONE; if (unlikely(bcm_busy_notifier)) /* Check for reentrant bug. */ return NOTIFY_DONE; spin_lock(&bcm_notifier_lock); list_for_each_entry(bcm_busy_notifier, &bcm_notifier_list, notifier) { spin_unlock(&bcm_notifier_lock); bcm_notify(bcm_busy_notifier, msg, dev); spin_lock(&bcm_notifier_lock); } bcm_busy_notifier = NULL; spin_unlock(&bcm_notifier_lock); return NOTIFY_DONE; } /* * initial settings for all BCM sockets to be set at socket creation time */ static int bcm_init(struct sock *sk) { struct bcm_sock *bo = bcm_sk(sk); bo->bound = 0; bo->ifindex = 0; bo->dropped_usr_msgs = 0; bo->bcm_proc_read = NULL; INIT_LIST_HEAD(&bo->tx_ops); INIT_LIST_HEAD(&bo->rx_ops); /* set notifier */ spin_lock(&bcm_notifier_lock); list_add_tail(&bo->notifier, &bcm_notifier_list); spin_unlock(&bcm_notifier_lock); return 0; } /* * standard socket functions */ static int bcm_release(struct socket *sock) { struct sock *sk = sock->sk; struct net *net; struct bcm_sock *bo; struct bcm_op *op, *next; if (!sk) return 0; net = sock_net(sk); bo = bcm_sk(sk); /* remove bcm_ops, timer, rx_unregister(), etc. */ spin_lock(&bcm_notifier_lock); while (bcm_busy_notifier == bo) { spin_unlock(&bcm_notifier_lock); schedule_timeout_uninterruptible(1); spin_lock(&bcm_notifier_lock); } list_del(&bo->notifier); spin_unlock(&bcm_notifier_lock); lock_sock(sk); list_for_each_entry_safe(op, next, &bo->tx_ops, list) bcm_remove_op(op); list_for_each_entry_safe(op, next, &bo->rx_ops, list) { /* * Don't care if we're bound or not (due to netdev problems) * can_rx_unregister() is always a save thing to do here. */ if (op->ifindex) { /* * Only remove subscriptions that had not * been removed due to NETDEV_UNREGISTER * in bcm_notifier() */ if (op->rx_reg_dev) { struct net_device *dev; dev = dev_get_by_index(net, op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else can_rx_unregister(net, NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); } synchronize_rcu(); list_for_each_entry_safe(op, next, &bo->rx_ops, list) bcm_remove_op(op); #if IS_ENABLED(CONFIG_PROC_FS) /* remove procfs entry */ if (net->can.bcmproc_dir && bo->bcm_proc_read) remove_proc_entry(bo->procname, net->can.bcmproc_dir); #endif /* CONFIG_PROC_FS */ /* remove device reference */ if (bo->bound) { bo->bound = 0; bo->ifindex = 0; } sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_put(sk); return 0; } static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); struct net *net = sock_net(sk); int ret = 0; if (len < sizeof(*addr)) return -EINVAL; lock_sock(sk); if (bo->bound) { ret = -EISCONN; goto fail; } /* bind a device to this socket */ if (addr->can_ifindex) { struct net_device *dev; dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { ret = -ENODEV; goto fail; } if (dev->type != ARPHRD_CAN) { dev_put(dev); ret = -ENODEV; goto fail; } bo->ifindex = dev->ifindex; dev_put(dev); } else { /* no interface reference for ifindex = 0 ('any' CAN device) */ bo->ifindex = 0; } #if IS_ENABLED(CONFIG_PROC_FS) if (net->can.bcmproc_dir) { /* unique socket address as filename */ sprintf(bo->procname, "%lu", sock_i_ino(sk)); bo->bcm_proc_read = proc_create_data(bo->procname, 0644, net->can.bcmproc_dir, &bcm_proc_fops, sk); if (!bo->bcm_proc_read) { ret = -ENOMEM; goto fail; } } #endif /* CONFIG_PROC_FS */ bo->bound = 1; fail: release_sock(sk); return ret; } static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int error = 0; int noblock; int err; noblock = flags & MSG_DONTWAIT; flags &= ~MSG_DONTWAIT; skb = skb_recv_datagram(sk, flags, noblock, &error); if (!skb) return error; if (skb->len < size) size = skb->len; err = memcpy_to_msg(msg, skb->data, size); if (err < 0) { skb_free_datagram(sk, skb); return err; } sock_recv_ts_and_drops(msg, sk, skb); if (msg->msg_name) { __sockaddr_check_size(sizeof(struct sockaddr_can)); msg->msg_namelen = sizeof(struct sockaddr_can); memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } skb_free_datagram(sk, skb); return size; } static const struct proto_ops bcm_ops = { .family = PF_CAN, .release = bcm_release, .bind = sock_no_bind, .connect = bcm_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = can_ioctl, /* use can_ioctl() from af_can.c */ .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, .sendmsg = bcm_sendmsg, .recvmsg = bcm_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; static struct proto bcm_proto __read_mostly = { .name = "CAN_BCM", .owner = THIS_MODULE, .obj_size = sizeof(struct bcm_sock), .init = bcm_init, }; static const struct can_proto bcm_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_BCM, .ops = &bcm_ops, .prot = &bcm_proto, }; static int canbcm_pernet_init(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* create /proc/net/can-bcm directory */ net->can.bcmproc_dir = proc_net_mkdir(net, "can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ return 0; } static void canbcm_pernet_exit(struct net *net) { #if IS_ENABLED(CONFIG_PROC_FS) /* remove /proc/net/can-bcm directory */ if (net->can.bcmproc_dir) remove_proc_entry("can-bcm", net->proc_net); #endif /* CONFIG_PROC_FS */ } static struct pernet_operations canbcm_pernet_ops __read_mostly = { .init = canbcm_pernet_init, .exit = canbcm_pernet_exit, }; static struct notifier_block canbcm_notifier = { .notifier_call = bcm_notifier }; static int __init bcm_module_init(void) { int err; pr_info("can: broadcast manager protocol (rev " CAN_BCM_VERSION " t)\n"); err = can_proto_register(&bcm_can_proto); if (err < 0) { printk(KERN_ERR "can: registration of bcm protocol failed\n"); return err; } register_pernet_subsys(&canbcm_pernet_ops); register_netdevice_notifier(&canbcm_notifier); return 0; } static void __exit bcm_module_exit(void) { can_proto_unregister(&bcm_can_proto); unregister_netdevice_notifier(&canbcm_notifier); unregister_pernet_subsys(&canbcm_pernet_ops); } module_init(bcm_module_init); module_exit(bcm_module_exit);
159 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 /* * Internal Header for the Direct Rendering Manager * * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * Copyright (c) 2009-2010, Code Aurora Forum. * All rights reserved. * * Author: Rickard E. (Rik) Faith <faith@valinux.com> * Author: Gareth Hughes <gareth@valinux.com> * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #ifndef _DRM_P_H_ #define _DRM_P_H_ #include <linux/agp_backend.h> #include <linux/cdev.h> #include <linux/dma-mapping.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/highmem.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/io.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> #include <linux/miscdevice.h> #include <linux/mm.h> #include <linux/mutex.h> #include <linux/platform_device.h> #include <linux/poll.h> #include <linux/ratelimit.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/types.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/dma-fence.h> #include <linux/module.h> #include <asm/mman.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <uapi/drm/drm.h> #include <uapi/drm/drm_mode.h> #include <drm/drm_agpsupport.h> #include <drm/drm_crtc.h> #include <drm/drm_fourcc.h> #include <drm/drm_global.h> #include <drm/drm_hashtab.h> #include <drm/drm_mm.h> #include <drm/drm_os_linux.h> #include <drm/drm_sarea.h> #include <drm/drm_drv.h> #include <drm/drm_prime.h> #include <drm/drm_pci.h> #include <drm/drm_file.h> #include <drm/drm_debugfs.h> #include <drm/drm_ioctl.h> #include <drm/drm_sysfs.h> #include <drm/drm_vblank.h> #include <drm/drm_irq.h> #include <drm/drm_device.h> struct module; struct device_node; struct videomode; struct reservation_object; struct dma_buf_attachment; struct pci_dev; struct pci_controller; /* * The following categories are defined: * * CORE: Used in the generic drm code: drm_ioctl.c, drm_mm.c, drm_memory.c, ... * This is the category used by the DRM_DEBUG() macro. * * DRIVER: Used in the vendor specific part of the driver: i915, radeon, ... * This is the category used by the DRM_DEBUG_DRIVER() macro. * * KMS: used in the modesetting code. * This is the category used by the DRM_DEBUG_KMS() macro. * * PRIME: used in the prime code. * This is the category used by the DRM_DEBUG_PRIME() macro. * * ATOMIC: used in the atomic code. * This is the category used by the DRM_DEBUG_ATOMIC() macro. * * VBL: used for verbose debug message in the vblank code * This is the category used by the DRM_DEBUG_VBL() macro. * * Enabling verbose debug messages is done through the drm.debug parameter, * each category being enabled by a bit. * * drm.debug=0x1 will enable CORE messages * drm.debug=0x2 will enable DRIVER messages * drm.debug=0x3 will enable CORE and DRIVER messages * ... * drm.debug=0x3f will enable all messages * * An interesting feature is that it's possible to enable verbose logging at * run-time by echoing the debug value in its sysfs node: * # echo 0xf > /sys/module/drm/parameters/debug */ #define DRM_UT_NONE 0x00 #define DRM_UT_CORE 0x01 #define DRM_UT_DRIVER 0x02 #define DRM_UT_KMS 0x04 #define DRM_UT_PRIME 0x08 #define DRM_UT_ATOMIC 0x10 #define DRM_UT_VBL 0x20 #define DRM_UT_STATE 0x40 /***********************************************************************/ /** \name DRM template customization defaults */ /*@{*/ /***********************************************************************/ /** \name Macros to make printk easier */ /*@{*/ #define _DRM_PRINTK(once, level, fmt, ...) \ do { \ printk##once(KERN_##level "[" DRM_NAME "] " fmt, \ ##__VA_ARGS__); \ } while (0) #define DRM_INFO(fmt, ...) \ _DRM_PRINTK(, INFO, fmt, ##__VA_ARGS__) #define DRM_NOTE(fmt, ...) \ _DRM_PRINTK(, NOTICE, fmt, ##__VA_ARGS__) #define DRM_WARN(fmt, ...) \ _DRM_PRINTK(, WARNING, fmt, ##__VA_ARGS__) #define DRM_INFO_ONCE(fmt, ...) \ _DRM_PRINTK(_once, INFO, fmt, ##__VA_ARGS__) #define DRM_NOTE_ONCE(fmt, ...) \ _DRM_PRINTK(_once, NOTICE, fmt, ##__VA_ARGS__) #define DRM_WARN_ONCE(fmt, ...) \ _DRM_PRINTK(_once, WARNING, fmt, ##__VA_ARGS__) /** * Error output. * * \param fmt printf() like format string. * \param arg arguments */ #define DRM_DEV_ERROR(dev, fmt, ...) \ drm_dev_printk(dev, KERN_ERR, DRM_UT_NONE, __func__, " *ERROR*",\ fmt, ##__VA_ARGS__) #define DRM_ERROR(fmt, ...) \ drm_printk(KERN_ERR, DRM_UT_NONE, fmt, ##__VA_ARGS__) /** * Rate limited error output. Like DRM_ERROR() but won't flood the log. * * \param fmt printf() like format string. * \param arg arguments */ #define DRM_DEV_ERROR_RATELIMITED(dev, fmt, ...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ \ if (__ratelimit(&_rs)) \ DRM_DEV_ERROR(dev, fmt, ##__VA_ARGS__); \ }) #define DRM_ERROR_RATELIMITED(fmt, ...) \ DRM_DEV_ERROR_RATELIMITED(NULL, fmt, ##__VA_ARGS__) #define DRM_DEV_INFO(dev, fmt, ...) \ drm_dev_printk(dev, KERN_INFO, DRM_UT_NONE, __func__, "", fmt, \ ##__VA_ARGS__) #define DRM_DEV_INFO_ONCE(dev, fmt, ...) \ ({ \ static bool __print_once __read_mostly; \ if (!__print_once) { \ __print_once = true; \ DRM_DEV_INFO(dev, fmt, ##__VA_ARGS__); \ } \ }) /** * Debug output. * * \param fmt printf() like format string. * \param arg arguments */ #define DRM_DEV_DEBUG(dev, fmt, args...) \ drm_dev_printk(dev, KERN_DEBUG, DRM_UT_CORE, __func__, "", fmt, \ ##args) #define DRM_DEBUG(fmt, ...) \ drm_printk(KERN_DEBUG, DRM_UT_CORE, fmt, ##__VA_ARGS__) #define DRM_DEV_DEBUG_DRIVER(dev, fmt, args...) \ drm_dev_printk(dev, KERN_DEBUG, DRM_UT_DRIVER, __func__, "", \ fmt, ##args) #define DRM_DEBUG_DRIVER(fmt, ...) \ drm_printk(KERN_DEBUG, DRM_UT_DRIVER, fmt, ##__VA_ARGS__) #define DRM_DEV_DEBUG_KMS(dev, fmt, args...) \ drm_dev_printk(dev, KERN_DEBUG, DRM_UT_KMS, __func__, "", fmt, \ ##args) #define DRM_DEBUG_KMS(fmt, ...) \ drm_printk(KERN_DEBUG, DRM_UT_KMS, fmt, ##__VA_ARGS__) #define DRM_DEV_DEBUG_PRIME(dev, fmt, args...) \ drm_dev_printk(dev, KERN_DEBUG, DRM_UT_PRIME, __func__, "", \ fmt, ##args) #define DRM_DEBUG_PRIME(fmt, ...) \ drm_printk(KERN_DEBUG, DRM_UT_PRIME, fmt, ##__VA_ARGS__) #define DRM_DEV_DEBUG_ATOMIC(dev, fmt, args...) \ drm_dev_printk(dev, KERN_DEBUG, DRM_UT_ATOMIC, __func__, "", \ fmt, ##args) #define DRM_DEBUG_ATOMIC(fmt, ...) \ drm_printk(KERN_DEBUG, DRM_UT_ATOMIC, fmt, ##__VA_ARGS__) #define DRM_DEV_DEBUG_VBL(dev, fmt, args...) \ drm_dev_printk(dev, KERN_DEBUG, DRM_UT_VBL, __func__, "", fmt, \ ##args) #define DRM_DEBUG_VBL(fmt, ...) \ drm_printk(KERN_DEBUG, DRM_UT_VBL, fmt, ##__VA_ARGS__) #define _DRM_DEV_DEFINE_DEBUG_RATELIMITED(dev, level, fmt, args...) \ ({ \ static DEFINE_RATELIMIT_STATE(_rs, \ DEFAULT_RATELIMIT_INTERVAL, \ DEFAULT_RATELIMIT_BURST); \ if (__ratelimit(&_rs)) \ drm_dev_printk(dev, KERN_DEBUG, DRM_UT_ ## level, \ __func__, "", fmt, ##args); \ }) /** * Rate limited debug output. Like DRM_DEBUG() but won't flood the log. * * \param fmt printf() like format string. * \param arg arguments */ #define DRM_DEV_DEBUG_RATELIMITED(dev, fmt, args...) \ DEV__DRM_DEFINE_DEBUG_RATELIMITED(dev, CORE, fmt, ##args) #define DRM_DEBUG_RATELIMITED(fmt, args...) \ DRM_DEV_DEBUG_RATELIMITED(NULL, fmt, ##args) #define DRM_DEV_DEBUG_DRIVER_RATELIMITED(dev, fmt, args...) \ _DRM_DEV_DEFINE_DEBUG_RATELIMITED(dev, DRIVER, fmt, ##args) #define DRM_DEBUG_DRIVER_RATELIMITED(fmt, args...) \ DRM_DEV_DEBUG_DRIVER_RATELIMITED(NULL, fmt, ##args) #define DRM_DEV_DEBUG_KMS_RATELIMITED(dev, fmt, args...) \ _DRM_DEV_DEFINE_DEBUG_RATELIMITED(dev, KMS, fmt, ##args) #define DRM_DEBUG_KMS_RATELIMITED(fmt, args...) \ DRM_DEV_DEBUG_KMS_RATELIMITED(NULL, fmt, ##args) #define DRM_DEV_DEBUG_PRIME_RATELIMITED(dev, fmt, args...) \ _DRM_DEV_DEFINE_DEBUG_RATELIMITED(dev, PRIME, fmt, ##args) #define DRM_DEBUG_PRIME_RATELIMITED(fmt, args...) \ DRM_DEV_DEBUG_PRIME_RATELIMITED(NULL, fmt, ##args) /* Format strings and argument splitters to simplify printing * various "complex" objects */ /*@}*/ /***********************************************************************/ /** \name Internal types and structures */ /*@{*/ #define DRM_IF_VERSION(maj, min) (maj << 16 | min) /** * drm_drv_uses_atomic_modeset - check if the driver implements * atomic_commit() * @dev: DRM device * * This check is useful if drivers do not have DRIVER_ATOMIC set but * have atomic modesetting internally implemented. */ static inline bool drm_drv_uses_atomic_modeset(struct drm_device *dev) { return dev->mode_config.funcs->atomic_commit != NULL; } #define DRM_SWITCH_POWER_ON 0 #define DRM_SWITCH_POWER_OFF 1 #define DRM_SWITCH_POWER_CHANGING 2 #define DRM_SWITCH_POWER_DYNAMIC_OFF 3 static __inline__ int drm_core_check_feature(struct drm_device *dev, int feature) { return ((dev->driver->driver_features & feature) ? 1 : 0); } /******************************************************************/ /** \name Internal function definitions */ /*@{*/ /* Driver support (drm_drv.h) */ /* * These are exported to drivers so that they can implement fencing using * DMA quiscent + idle. DMA quiescent usually requires the hardware lock. */ /*@}*/ /* returns true if currently okay to sleep */ static __inline__ bool drm_can_sleep(void) { if (in_atomic() || in_dbg_master() || irqs_disabled()) return false; return true; } /* helper for handling conditionals in various for_each macros */ #define for_each_if(condition) if (!(condition)) {} else #endif
1271 2494 2493 2495 2457 8 21 5 5 2495 2495 127 2610 2611 1268 2609 970 2610 2070 2495 2496 2495 2614 2614 44 44 44 44 1651 1650 1651 1 1 1650 1650 5240 73 60 5105 5121 5103 5240 5239 857 856 857 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 /* * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, * for the blk-mq scheduling framework * * Copyright (C) 2016 Jens Axboe <axboe@kernel.dk> */ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/elevator.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/sbitmap.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" #include "blk-mq-sched.h" /* * See Documentation/block/deadline-iosched.txt */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ struct deadline_data { /* * run time data */ /* * requests (deadline_rq s) are present on both sort_list and fifo_list */ struct rb_root sort_list[2]; struct list_head fifo_list[2]; /* * next in sort order. read, write or both are NULL */ struct request *next_rq[2]; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ /* * settings that change how the i/o scheduler behaves */ int fifo_expire[2]; int fifo_batch; int writes_starved; int front_merges; spinlock_t lock; struct list_head dispatch; }; static inline struct rb_root * deadline_rb_root(struct deadline_data *dd, struct request *rq) { return &dd->sort_list[rq_data_dir(rq)]; } /* * get the request after `rq' in sector-sorted order */ static inline struct request * deadline_latter_request(struct request *rq) { struct rb_node *node = rb_next(&rq->rb_node); if (node) return rb_entry_rq(node); return NULL; } static void deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) { struct rb_root *root = deadline_rb_root(dd, rq); elv_rb_add(root, rq); } static inline void deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) { const int data_dir = rq_data_dir(rq); if (dd->next_rq[data_dir] == rq) dd->next_rq[data_dir] = deadline_latter_request(rq); elv_rb_del(deadline_rb_root(dd, rq), rq); } /* * remove rq from rbtree and fifo. */ static void deadline_remove_request(struct request_queue *q, struct request *rq) { struct deadline_data *dd = q->elevator->elevator_data; list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) deadline_del_rq_rb(dd, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) q->last_merge = NULL; } static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { elv_rb_del(deadline_rb_root(dd, req), req); deadline_add_rq_rb(dd, req); } } static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo */ if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { if (time_before((unsigned long)next->fifo_time, (unsigned long)req->fifo_time)) { list_move(&req->queuelist, &next->queuelist); req->fifo_time = next->fifo_time; } } /* * kill knowledge of next, this one is a goner */ deadline_remove_request(q, next); } /* * move an entry to dispatch queue */ static void deadline_move_request(struct deadline_data *dd, struct request *rq) { const int data_dir = rq_data_dir(rq); dd->next_rq[READ] = NULL; dd->next_rq[WRITE] = NULL; dd->next_rq[data_dir] = deadline_latter_request(rq); /* * take it off the sort and fifo list */ deadline_remove_request(rq->q, rq); } /* * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) */ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) { struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); /* * rq is expired! */ if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) return 1; return 0; } /* * deadline_dispatch_requests selects the best request according to * read/write expire, fifo_batch, etc */ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq; bool reads, writes; int data_dir; if (!list_empty(&dd->dispatch)) { rq = list_first_entry(&dd->dispatch, struct request, queuelist); list_del_init(&rq->queuelist); goto done; } reads = !list_empty(&dd->fifo_list[READ]); writes = !list_empty(&dd->fifo_list[WRITE]); /* * batches are currently reads XOR writes */ if (dd->next_rq[WRITE]) rq = dd->next_rq[WRITE]; else rq = dd->next_rq[READ]; if (rq && dd->batching < dd->fifo_batch) /* we have a next request are still entitled to batch */ goto dispatch_request; /* * at this point we are not running a batch. select the appropriate * data direction (read / write) */ if (reads) { BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); if (writes && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; data_dir = READ; goto dispatch_find_request; } /* * there are either no reads or writes have been starved */ if (writes) { dispatch_writes: BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); dd->starved = 0; data_dir = WRITE; goto dispatch_find_request; } return NULL; dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ rq = rq_entry_fifo(dd->fifo_list[data_dir].next); } else { /* * The last req was the same dir and we have a next request in * sort order. No expired requests so continue on from here. */ rq = dd->next_rq[data_dir]; } dd->batching = 0; dispatch_request: /* * rq is the selected appropriate request. */ dd->batching++; deadline_move_request(dd, rq); done: rq->rq_flags |= RQF_STARTED; return rq; } static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct request *rq; spin_lock(&dd->lock); rq = __dd_dispatch_request(hctx); spin_unlock(&dd->lock); return rq; } static void dd_exit_queue(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; BUG_ON(!list_empty(&dd->fifo_list[READ])); BUG_ON(!list_empty(&dd->fifo_list[WRITE])); kfree(dd); } /* * initialize elevator private data (deadline_data). */ static int dd_init_queue(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; eq = elevator_alloc(q, e); if (!eq) return -ENOMEM; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); if (!dd) { kobject_put(&eq->kobj); return -ENOMEM; } eq->elevator_data = dd; INIT_LIST_HEAD(&dd->fifo_list[READ]); INIT_LIST_HEAD(&dd->fifo_list[WRITE]); dd->sort_list[READ] = RB_ROOT; dd->sort_list[WRITE] = RB_ROOT; dd->fifo_expire[READ] = read_expire; dd->fifo_expire[WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; dd->fifo_batch = fifo_batch; spin_lock_init(&dd->lock); INIT_LIST_HEAD(&dd->dispatch); q->elevator = eq; return 0; } static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; return ELEVATOR_FRONT_MERGE; } } return ELEVATOR_NO_MERGE; } static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct request *free = NULL; bool ret; spin_lock(&dd->lock); ret = blk_mq_sched_try_merge(q, bio, &free); spin_unlock(&dd->lock); if (free) blk_mq_free_request(free); return ret; } /* * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, bool at_head) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; const int data_dir = rq_data_dir(rq); if (blk_mq_sched_try_insert_merge(q, rq)) return; blk_mq_sched_request_inserted(rq); if (at_head || blk_rq_is_passthrough(rq)) { if (at_head) list_add(&rq->queuelist, &dd->dispatch); else list_add_tail(&rq->queuelist, &dd->dispatch); } else { deadline_add_rq_rb(dd, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); if (!q->last_merge) q->last_merge = rq; } /* * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); } } static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list, bool at_head) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; spin_lock(&dd->lock); while (!list_empty(list)) { struct request *rq; rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); dd_insert_request(hctx, rq, at_head); } spin_unlock(&dd->lock); } static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; return !list_empty_careful(&dd->dispatch) || !list_empty_careful(&dd->fifo_list[0]) || !list_empty_careful(&dd->fifo_list[1]); } /* * sysfs parts below */ static ssize_t deadline_var_show(int var, char *page) { return sprintf(page, "%d\n", var); } static void deadline_var_store(int *var, const char *page) { char *p = (char *) page; *var = simple_strtol(p, &p, 10); } #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ int __data = __VAR; \ if (__CONV) \ __data = jiffies_to_msecs(__data); \ return deadline_var_show(__data, (page)); \ } SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ int __data; \ deadline_var_store(&__data, (page)); \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ if (__CONV) \ *(__PTR) = msecs_to_jiffies(__data); \ else \ *(__PTR) = __data; \ return count; \ } STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); #undef STORE_FUNCTION #define DD_ATTR(name) \ __ATTR(name, S_IRUGO|S_IWUSR, deadline_##name##_show, \ deadline_##name##_store) static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(read_expire), DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), DD_ATTR(fifo_batch), __ATTR_NULL }; #ifdef CONFIG_BLK_DEBUG_FS #define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_lock(&dd->lock); \ return seq_list_start(&dd->fifo_list[ddir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ loff_t *pos) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ return seq_list_next(v, &dd->fifo_list[ddir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ __releases(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ \ spin_unlock(&dd->lock); \ } \ \ static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ .start = deadline_##name##_fifo_start, \ .next = deadline_##name##_fifo_next, \ .stop = deadline_##name##_fifo_stop, \ .show = blk_mq_debugfs_rq_show, \ }; \ \ static int deadline_##name##_next_rq_show(void *data, \ struct seq_file *m) \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct request *rq = dd->next_rq[ddir]; \ \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read) DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write) #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->batching); return 0; } static int deadline_starved_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; seq_printf(m, "%u\n", dd->starved); return 0; } static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) __acquires(&dd->lock) { struct request_queue *q = m->private; struct deadline_data *dd = q->elevator->elevator_data; spin_lock(&dd->lock); return seq_list_start(&dd->dispatch, *pos); } static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos) { struct request_queue *q = m->private; struct deadline_data *dd = q->elevator->elevator_data; return seq_list_next(v, &dd->dispatch, pos); } static void deadline_dispatch_stop(struct seq_file *m, void *v) __releases(&dd->lock) { struct request_queue *q = m->private; struct deadline_data *dd = q->elevator->elevator_data; spin_unlock(&dd->lock); } static const struct seq_operations deadline_dispatch_seq_ops = { .start = deadline_dispatch_start, .next = deadline_dispatch_next, .stop = deadline_dispatch_stop, .show = blk_mq_debugfs_rq_show, }; #define DEADLINE_QUEUE_DDIR_ATTRS(name) \ {#name "_fifo_list", 0400, .seq_ops = &deadline_##name##_fifo_seq_ops}, \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { DEADLINE_QUEUE_DDIR_ATTRS(read), DEADLINE_QUEUE_DDIR_ATTRS(write), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS #endif static struct elevator_type mq_deadline = { .ops.mq = { .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .next_request = elv_rb_latter_request, .former_request = elv_rb_former_request, .bio_merge = dd_bio_merge, .request_merge = dd_request_merge, .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, .init_sched = dd_init_queue, .exit_sched = dd_exit_queue, }, .uses_mq = true, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = deadline_queue_debugfs_attrs, #endif .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); static int __init deadline_init(void) { return elv_register(&mq_deadline); } static void __exit deadline_exit(void) { elv_unregister(&mq_deadline); } module_init(deadline_init); module_exit(deadline_exit); MODULE_AUTHOR("Jens Axboe"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler");
143 143 143 143 142 142 37 1 141 143 143 141 73 72 72 72 13 2 2 1 13 72 63 2 63 72 2 72 138 8 135 135 135 135 1 1 135 135 78 1 7 6 1 65 65 61 61 65 65 65 2 2 2 2 68 68 68 2 2 68 2 2 68 68 1 68 68 68 68 68 68 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 62 62 62 62 62 33 62 62 62 76 76 34 34 34 34 1 34 34 34 34 1 1 1 1 34 34 4 30 30 30 30 30 8 8 3 7 1 1 1 1 1 2 1 1 2 29 28 1 29 6 34 1 4 4 4 4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 /* * Copyright (C) International Business Machines Corp., 2000-2005 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * jfs_xtree.c: extent allocation descriptor B+-tree manager */ #include <linux/fs.h> #include <linux/module.h> #include <linux/quotaops.h> #include <linux/seq_file.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_dmap.h" #include "jfs_dinode.h" #include "jfs_superblock.h" #include "jfs_debug.h" /* * xtree local flag */ #define XT_INSERT 0x00000001 /* * xtree key/entry comparison: extent offset * * return: * -1: k < start of extent * 0: start_of_extent <= k <= end_of_extent * 1: k > end_of_extent */ #define XT_CMP(CMP, K, X, OFFSET64)\ {\ OFFSET64 = offsetXAD(X);\ (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ ((K) < OFFSET64) ? -1 : 0;\ } /* write a xad entry */ #define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ {\ (XAD)->flag = (FLAG);\ XADoffset((XAD), (OFF));\ XADlength((XAD), (LEN));\ XADaddress((XAD), (ADDR));\ } #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) /* get page buffer for specified block address */ /* ToDo: Replace this ugly macro with a function */ #define XT_GETPAGE(IP, BN, MP, SIZE, P, RC) \ do { \ BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot); \ if (!(RC)) { \ if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) || \ (le16_to_cpu((P)->header.nextindex) > \ le16_to_cpu((P)->header.maxentry)) || \ (le16_to_cpu((P)->header.maxentry) > \ (((BN) == 0) ? XTROOTMAXSLOT : PSIZE >> L2XTSLOTSIZE))) { \ jfs_error((IP)->i_sb, \ "XT_GETPAGE: xtree page corrupt\n"); \ BT_PUTPAGE(MP); \ MP = NULL; \ RC = -EIO; \ } \ } \ } while (0) /* for consistency */ #define XT_PUTPAGE(MP) BT_PUTPAGE(MP) #define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) /* xtree entry parameter descriptor */ struct xtsplit { struct metapage *mp; s16 index; u8 flag; s64 off; s64 addr; int len; struct pxdlist *pxdlist; }; /* * statistics */ #ifdef CONFIG_JFS_STATISTICS static struct { uint search; uint fastSearch; uint split; } xtStat; #endif /* * forward references */ static int xtSearch(struct inode *ip, s64 xoff, s64 *next, int *cmpp, struct btstack * btstack, int flag); static int xtSplitUp(tid_t tid, struct inode *ip, struct xtsplit * split, struct btstack * btstack); static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp); static int xtSplitRoot(tid_t tid, struct inode *ip, struct xtsplit * split, struct metapage ** rmpp); #ifdef _STILL_TO_PORT static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, xtpage_t * fp, struct btstack * btstack); static int xtSearchNode(struct inode *ip, xad_t * xad, int *cmpp, struct btstack * btstack, int flag); static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); #endif /* _STILL_TO_PORT */ /* * xtLookup() * * function: map a single page into a physical extent; */ int xtLookup(struct inode *ip, s64 lstart, s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check) { int rc = 0; struct btstack btstack; int cmp; s64 bn; struct metapage *mp; xtpage_t *p; int index; xad_t *xad; s64 next, size, xoff, xend; int xlen; s64 xaddr; *paddr = 0; *plen = llen; if (!no_check) { /* is lookup offset beyond eof ? */ size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> JFS_SBI(ip->i_sb)->l2bsize; if (lstart >= size) return 0; } /* * search for the xad entry covering the logical extent */ //search: if ((rc = xtSearch(ip, lstart, &next, &cmp, &btstack, 0))) { jfs_err("xtLookup: xtSearch returned %d", rc); return rc; } /* * compute the physical extent covering logical extent * * N.B. search may have failed (e.g., hole in sparse file), * and returned the index of the next entry. */ /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* is xad found covering start of logical extent ? * lstart is a page start address, * i.e., lstart cannot start in a hole; */ if (cmp) { if (next) *plen = min(next - lstart, llen); goto out; } /* * lxd covered by xad */ xad = &p->xad[index]; xoff = offsetXAD(xad); xlen = lengthXAD(xad); xend = xoff + xlen; xaddr = addressXAD(xad); /* initialize new pxd */ *pflag = xad->flag; *paddr = xaddr + (lstart - xoff); /* a page must be fully covered by an xad */ *plen = min(xend - lstart, llen); out: XT_PUTPAGE(mp); return rc; } /* * xtSearch() * * function: search for the xad entry covering specified offset. * * parameters: * ip - file object; * xoff - extent offset; * nextp - address of next extent (if any) for search miss * cmpp - comparison result: * btstack - traverse stack; * flag - search process flag (XT_INSERT); * * returns: * btstack contains (bn, index) of search path traversed to the entry. * *cmpp is set to result of comparison with the entry returned. * the page containing the entry is pinned at exit. */ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp, int *cmpp, struct btstack * btstack, int flag) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); int rc = 0; int cmp = 1; /* init for empty page */ s64 bn; /* block number */ struct metapage *mp; /* page buffer */ xtpage_t *p; /* page */ xad_t *xad; int base, index, lim, btindex; struct btframe *btsp; int nsplit = 0; /* number of pages to split */ s64 t64; s64 next = 0; INCREMENT(xtStat.search); BT_CLR(btstack); btstack->nsplit = 0; /* * search down tree from root: * * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of * internal page, child page Pi contains entry with k, Ki <= K < Kj. * * if entry with search key K is not found * internal page search find the entry with largest key Ki * less than K which point to the child page to search; * leaf page search find the entry with smallest key Kj * greater than K so that the returned index is the position of * the entry to be shifted right for insertion of new entry. * for empty tree, search key is greater than any key of the tree. * * by convention, root bn = 0. */ for (bn = 0;;) { /* get/pin the page to search */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* try sequential access heuristics with the previous * access entry in target leaf page: * once search narrowed down into the target leaf, * key must either match an entry in the leaf or * key entry does not exist in the tree; */ //fastSearch: if ((jfs_ip->btorder & BT_SEQUENTIAL) && (p->header.flag & BT_LEAF) && (index = jfs_ip->btindex) < le16_to_cpu(p->header.nextindex)) { xad = &p->xad[index]; t64 = offsetXAD(xad); if (xoff < t64 + lengthXAD(xad)) { if (xoff >= t64) { *cmpp = 0; goto out; } /* stop sequential access heuristics */ goto binarySearch; } else { /* (t64 + lengthXAD(xad)) <= xoff */ /* try next sequential entry */ index++; if (index < le16_to_cpu(p->header.nextindex)) { xad++; t64 = offsetXAD(xad); if (xoff < t64 + lengthXAD(xad)) { if (xoff >= t64) { *cmpp = 0; goto out; } /* miss: key falls between * previous and this entry */ *cmpp = 1; next = t64; goto out; } /* (xoff >= t64 + lengthXAD(xad)); * matching entry may be further out: * stop heuristic search */ /* stop sequential access heuristics */ goto binarySearch; } /* (index == p->header.nextindex); * miss: key entry does not exist in * the target leaf/tree */ *cmpp = 1; goto out; } /* * if hit, return index of the entry found, and * if miss, where new entry with search key is * to be inserted; */ out: /* compute number of pages to split */ if (flag & XT_INSERT) { if (p->header.nextindex == /* little-endian */ p->header.maxentry) nsplit++; else nsplit = 0; btstack->nsplit = nsplit; } /* save search result */ btsp = btstack->top; btsp->bn = bn; btsp->index = index; btsp->mp = mp; /* update sequential access heuristics */ jfs_ip->btindex = index; if (nextp) *nextp = next; INCREMENT(xtStat.fastSearch); return 0; } /* well, ... full search now */ binarySearch: lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; /* * binary search with search key K on the current page */ for (base = XTENTRYSTART; lim; lim >>= 1) { index = base + (lim >> 1); XT_CMP(cmp, xoff, &p->xad[index], t64); if (cmp == 0) { /* * search hit */ /* search hit - leaf page: * return the entry found */ if (p->header.flag & BT_LEAF) { *cmpp = cmp; /* compute number of pages to split */ if (flag & XT_INSERT) { if (p->header.nextindex == p->header.maxentry) nsplit++; else nsplit = 0; btstack->nsplit = nsplit; } /* save search result */ btsp = btstack->top; btsp->bn = bn; btsp->index = index; btsp->mp = mp; /* init sequential access heuristics */ btindex = jfs_ip->btindex; if (index == btindex || index == btindex + 1) jfs_ip->btorder = BT_SEQUENTIAL; else jfs_ip->btorder = BT_RANDOM; jfs_ip->btindex = index; return 0; } /* search hit - internal page: * descend/search its child page */ if (index < le16_to_cpu(p->header.nextindex)-1) next = offsetXAD(&p->xad[index + 1]); goto next; } if (cmp > 0) { base = index + 1; --lim; } } /* * search miss * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or maxentry index. */ if (base < le16_to_cpu(p->header.nextindex)) next = offsetXAD(&p->xad[base]); /* * search miss - leaf page: * * return location of entry (base) where new entry with * search key K is to be inserted. */ if (p->header.flag & BT_LEAF) { *cmpp = cmp; /* compute number of pages to split */ if (flag & XT_INSERT) { if (p->header.nextindex == p->header.maxentry) nsplit++; else nsplit = 0; btstack->nsplit = nsplit; } /* save search result */ btsp = btstack->top; btsp->bn = bn; btsp->index = base; btsp->mp = mp; /* init sequential access heuristics */ btindex = jfs_ip->btindex; if (base == btindex || base == btindex + 1) jfs_ip->btorder = BT_SEQUENTIAL; else jfs_ip->btorder = BT_RANDOM; jfs_ip->btindex = base; if (nextp) *nextp = next; return 0; } /* * search miss - non-leaf page: * * if base is non-zero, decrement base by one to get the parent * entry of the child page to search. */ index = base ? base - 1 : base; /* * go down to child page */ next: /* update number of pages to split */ if (p->header.nextindex == p->header.maxentry) nsplit++; else nsplit = 0; /* push (bn, index) of the parent page/entry */ if (BT_STACK_FULL(btstack)) { jfs_error(ip->i_sb, "stack overrun!\n"); XT_PUTPAGE(mp); return -EIO; } BT_PUSH(btstack, bn, index); /* get the child page block number */ bn = addressXAD(&p->xad[index]); /* unpin the parent page */ XT_PUTPAGE(mp); } } /* * xtInsert() * * function: * * parameter: * tid - transaction id; * ip - file object; * xflag - extent flag (XAD_NOTRECORDED): * xoff - extent offset; * xlen - extent length; * xaddrp - extent address pointer (in/out): * if (*xaddrp) * caller allocated data extent at *xaddrp; * else * allocate data extent and return its xaddr; * flag - * * return: */ int xtInsert(tid_t tid, /* transaction id */ struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp, int flag) { int rc = 0; s64 xaddr, hint; struct metapage *mp; /* meta-page buffer */ xtpage_t *p; /* base B+-tree index page */ s64 bn; int index, nextindex; struct btstack btstack; /* traverse stack */ struct xtsplit split; /* split information */ xad_t *xad; int cmp; s64 next; struct tlock *tlck; struct xtlock *xtlck; jfs_info("xtInsert: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); /* * search for the entry location at which to insert: * * xtFastSearch() and xtSearch() both returns (leaf page * pinned, index at which to insert). * n.b. xtSearch() may return index of maxentry of * the full page. */ if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) return rc; /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* This test must follow XT_GETSEARCH since mp must be valid if * we branch to out: */ if ((cmp == 0) || (next && (xlen > next - xoff))) { rc = -EEXIST; goto out; } /* * allocate data extent requested * * allocation hint: last xad */ if ((xaddr = *xaddrp) == 0) { if (index > XTENTRYSTART) { xad = &p->xad[index - 1]; hint = addressXAD(xad) + lengthXAD(xad) - 1; } else hint = 0; if ((rc = dquot_alloc_block(ip, xlen))) goto out; if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) { dquot_free_block(ip, xlen); goto out; } } /* * insert entry for new extent */ xflag |= XAD_NEW; /* * if the leaf page is full, split the page and * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ nextindex = le16_to_cpu(p->header.nextindex); if (nextindex == le16_to_cpu(p->header.maxentry)) { split.mp = mp; split.index = index; split.flag = xflag; split.off = xoff; split.len = xlen; split.addr = xaddr; split.pxdlist = NULL; if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { /* undo data extent allocation */ if (*xaddrp == 0) { dbFree(ip, xaddr, (s64) xlen); dquot_free_block(ip, xlen); } return rc; } *xaddrp = xaddr; return 0; } /* * insert the new entry into the leaf page */ /* * acquire a transaction lock on the leaf page; * * action: xad insertion/extension; */ BT_MARK_DIRTY(mp, ip); /* if insert into middle, shift right remaining entries. */ if (index < nextindex) memmove(&p->xad[index + 1], &p->xad[index], (nextindex - index) * sizeof(xad_t)); /* insert the new entry: mark the entry NEW */ xad = &p->xad[index]; XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); /* advance next available entry index */ le16_add_cpu(&p->header.nextindex, 1); /* Don't log it if there are no links to the file */ if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index, (int)xtlck->lwm.offset) : index; xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; } *xaddrp = xaddr; out: /* unpin the leaf page */ XT_PUTPAGE(mp); return rc; } /* * xtSplitUp() * * function: * split full pages as propagating insertion up the tree * * parameter: * tid - transaction id; * ip - file object; * split - entry parameter descriptor; * btstack - traverse stack from xtSearch() * * return: */ static int xtSplitUp(tid_t tid, struct inode *ip, struct xtsplit * split, struct btstack * btstack) { int rc = 0; struct metapage *smp; xtpage_t *sp; /* split page */ struct metapage *rmp; s64 rbn; /* new right page block number */ struct metapage *rcmp; xtpage_t *rcp; /* right child page */ s64 rcbn; /* right child page block number */ int skip; /* index of entry of insertion */ int nextindex; /* next available entry index of p */ struct btframe *parent; /* parent page entry on traverse stack */ xad_t *xad; s64 xaddr; int xlen; int nsplit; /* number of pages split */ struct pxdlist pxdlist; pxd_t *pxd; struct tlock *tlck; struct xtlock *xtlck; smp = split->mp; sp = XT_PAGE(ip, smp); /* is inode xtree root extension/inline EA area free ? */ if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) && (le16_to_cpu(sp->header.maxentry) < XTROOTMAXSLOT) && (JFS_IP(ip)->mode2 & INLINEEA)) { sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT); JFS_IP(ip)->mode2 &= ~INLINEEA; BT_MARK_DIRTY(smp, ip); /* * acquire a transaction lock on the leaf page; * * action: xad insertion/extension; */ /* if insert into middle, shift right remaining entries. */ skip = split->index; nextindex = le16_to_cpu(sp->header.nextindex); if (skip < nextindex) memmove(&sp->xad[skip + 1], &sp->xad[skip], (nextindex - skip) * sizeof(xad_t)); /* insert the new entry: mark the entry NEW */ xad = &sp->xad[skip]; XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); /* advance next available entry index */ le16_add_cpu(&sp->header.nextindex, 1); /* Don't log it if there are no links to the file */ if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; xtlck->lwm.offset = (xtlck->lwm.offset) ? min(skip, (int)xtlck->lwm.offset) : skip; xtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - xtlck->lwm.offset; } return 0; } /* * allocate new index blocks to cover index page split(s) * * allocation hint: ? */ if (split->pxdlist == NULL) { nsplit = btstack->nsplit; split->pxdlist = &pxdlist; pxdlist.maxnpxd = pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; xlen = JFS_SBI(ip->i_sb)->nbperpage; for (; nsplit > 0; nsplit--, pxd++) { if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr)) == 0) { PXDaddress(pxd, xaddr); PXDlength(pxd, xlen); pxdlist.maxnpxd++; continue; } /* undo allocation */ XT_PUTPAGE(smp); return rc; } } /* * Split leaf page <sp> into <sp> and a new right page <rp>. * * The split routines insert the new entry into the leaf page, * and acquire txLock as appropriate. * return <rp> pinned and its block number <rpbn>. */ rc = (sp->header.flag & BT_ROOT) ? xtSplitRoot(tid, ip, split, &rmp) : xtSplitPage(tid, ip, split, &rmp, &rbn); XT_PUTPAGE(smp); if (rc) return -EIO; /* * propagate up the router entry for the leaf page just split * * insert a router entry for the new page into the parent page, * propagate the insert/split up the tree by walking back the stack * of (bn of parent page, index of child page entry in parent page) * that were traversed during the search for the page that split. * * the propagation of insert/split up the tree stops if the root * splits or the page inserted into doesn't have to split to hold * the new entry. * * the parent entry for the split page remains the same, and * a new entry is inserted at its right with the first key and * block number of the new right page. * * There are a maximum of 3 pages pinned at any time: * right child, left parent and right parent (when the parent splits) * to keep the child page pinned while working on the parent. * make sure that all pins are released at exit. */ while ((parent = BT_POP(btstack)) != NULL) { /* parent page specified by stack frame <parent> */ /* keep current child pages <rcp> pinned */ rcmp = rmp; rcbn = rbn; rcp = XT_PAGE(ip, rcmp); /* * insert router entry in parent for new right child page <rp> */ /* get/pin the parent page <sp> */ XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); if (rc) { XT_PUTPAGE(rcmp); return rc; } /* * The new key entry goes ONE AFTER the index of parent entry, * because the split was to the right. */ skip = parent->index + 1; /* * split or shift right remaining entries of the parent page */ nextindex = le16_to_cpu(sp->header.nextindex); /* * parent page is full - split the parent page */ if (nextindex == le16_to_cpu(sp->header.maxentry)) { /* init for parent page split */ split->mp = smp; split->index = skip; /* index at insert */ split->flag = XAD_NEW; split->off = offsetXAD(&rcp->xad[XTENTRYSTART]); split->len = JFS_SBI(ip->i_sb)->nbperpage; split->addr = rcbn; /* unpin previous right child page */ XT_PUTPAGE(rcmp); /* The split routines insert the new entry, * and acquire txLock as appropriate. * return <rp> pinned and its block number <rpbn>. */ rc = (sp->header.flag & BT_ROOT) ? xtSplitRoot(tid, ip, split, &rmp) : xtSplitPage(tid, ip, split, &rmp, &rbn); if (rc) { XT_PUTPAGE(smp); return rc; } XT_PUTPAGE(smp); /* keep new child page <rp> pinned */ } /* * parent page is not full - insert in parent page */ else { /* * insert router entry in parent for the right child * page from the first entry of the right child page: */ /* * acquire a transaction lock on the parent page; * * action: router xad insertion; */ BT_MARK_DIRTY(smp, ip); /* * if insert into middle, shift right remaining entries */ if (skip < nextindex) memmove(&sp->xad[skip + 1], &sp->xad[skip], (nextindex - skip) << L2XTSLOTSIZE); /* insert the router entry */ xad = &sp->xad[skip]; XT_PUTENTRY(xad, XAD_NEW, offsetXAD(&rcp->xad[XTENTRYSTART]), JFS_SBI(ip->i_sb)->nbperpage, rcbn); /* advance next available entry index. */ le16_add_cpu(&sp->header.nextindex, 1); /* Don't log it if there are no links to the file */ if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; xtlck->lwm.offset = (xtlck->lwm.offset) ? min(skip, (int)xtlck->lwm.offset) : skip; xtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - xtlck->lwm.offset; } /* unpin parent page */ XT_PUTPAGE(smp); /* exit propagate up */ break; } } /* unpin current right page */ XT_PUTPAGE(rmp); return 0; } /* * xtSplitPage() * * function: * split a full non-root page into * original/split/left page and new right page * i.e., the original/split page remains as left page. * * parameter: * int tid, * struct inode *ip, * struct xtsplit *split, * struct metapage **rmpp, * u64 *rbnp, * * return: * Pointer to page in which to insert or NULL on error. */ static int xtSplitPage(tid_t tid, struct inode *ip, struct xtsplit * split, struct metapage ** rmpp, s64 * rbnp) { int rc = 0; struct metapage *smp; xtpage_t *sp; struct metapage *rmp; xtpage_t *rp; /* new right page allocated */ s64 rbn; /* new right page block number */ struct metapage *mp; xtpage_t *p; s64 nextbn; int skip, maxentry, middle, righthalf, n; xad_t *xad; struct pxdlist *pxdlist; pxd_t *pxd; struct tlock *tlck; struct xtlock *sxtlck = NULL, *rxtlck = NULL; int quota_allocation = 0; smp = split->mp; sp = XT_PAGE(ip, smp); INCREMENT(xtStat.split); pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) goto clean_up; quota_allocation += lengthPXD(pxd); /* * allocate the new right page for the split */ rmp = get_metapage(ip, rbn, PSIZE, 1); if (rmp == NULL) { rc = -EIO; goto clean_up; } jfs_info("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p", ip, smp, rmp); BT_MARK_DIRTY(rmp, ip); /* * action: new page; */ rp = (xtpage_t *) rmp->data; rp->header.self = *pxd; rp->header.flag = sp->header.flag & BT_TYPE; rp->header.maxentry = sp->header.maxentry; /* little-endian */ rp->header.nextindex = cpu_to_le16(XTENTRYSTART); BT_MARK_DIRTY(smp, ip); /* Don't log it if there are no links to the file */ if (!test_cflag(COMMIT_Nolink, ip)) { /* * acquire a transaction lock on the new right page; */ tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); rxtlck = (struct xtlock *) & tlck->lock; rxtlck->lwm.offset = XTENTRYSTART; /* * acquire a transaction lock on the split page */ tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); sxtlck = (struct xtlock *) & tlck->lock; } /* * initialize/update sibling pointers of <sp> and <rp> */ nextbn = le64_to_cpu(sp->header.next); rp->header.next = cpu_to_le64(nextbn); rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); sp->header.next = cpu_to_le64(rbn); skip = split->index; /* * sequential append at tail (after last entry of last page) * * if splitting the last page on a level because of appending * a entry to it (skip is maxentry), it's likely that the access is * sequential. adding an empty page on the side of the level is less * work and can push the fill factor much higher than normal. * if we're wrong it's no big deal - we will do the split the right * way next time. * (it may look like it's equally easy to do a similar hack for * reverse sorted data, that is, split the tree left, but it's not. * Be my guest.) */ if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) { /* * acquire a transaction lock on the new/right page; * * action: xad insertion; */ /* insert entry at the first entry of the new right page */ xad = &rp->xad[XTENTRYSTART]; XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); if (!test_cflag(COMMIT_Nolink, ip)) { /* rxtlck->lwm.offset = XTENTRYSTART; */ rxtlck->lwm.length = 1; } *rmpp = rmp; *rbnp = rbn; jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); return 0; } /* * non-sequential insert (at possibly middle page) */ /* * update previous pointer of old next/right page of <sp> */ if (nextbn != 0) { XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) { XT_PUTPAGE(rmp); goto clean_up; } BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the next page; * * action:sibling pointer update; */ if (!test_cflag(COMMIT_Nolink, ip)) tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); p->header.prev = cpu_to_le64(rbn); /* sibling page may have been updated previously, or * it may be updated later; */ XT_PUTPAGE(mp); } /* * split the data between the split and new/right pages */ maxentry = le16_to_cpu(sp->header.maxentry); middle = maxentry >> 1; righthalf = maxentry - middle; /* * skip index in old split/left page - insert into left page: */ if (skip <= middle) { /* move right half of split page to the new right page */ memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], righthalf << L2XTSLOTSIZE); /* shift right tail of left half to make room for new entry */ if (skip < middle) memmove(&sp->xad[skip + 1], &sp->xad[skip], (middle - skip) << L2XTSLOTSIZE); /* insert new entry */ xad = &sp->xad[skip]; XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); /* update page header */ sp->header.nextindex = cpu_to_le16(middle + 1); if (!test_cflag(COMMIT_Nolink, ip)) { sxtlck->lwm.offset = (sxtlck->lwm.offset) ? min(skip, (int)sxtlck->lwm.offset) : skip; } rp->header.nextindex = cpu_to_le16(XTENTRYSTART + righthalf); } /* * skip index in new right page - insert into right page: */ else { /* move left head of right half to right page */ n = skip - middle; memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], n << L2XTSLOTSIZE); /* insert new entry */ n += XTENTRYSTART; xad = &rp->xad[n]; XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); /* move right tail of right half to right page */ if (skip < maxentry) memmove(&rp->xad[n + 1], &sp->xad[skip], (maxentry - skip) << L2XTSLOTSIZE); /* update page header */ sp->header.nextindex = cpu_to_le16(middle); if (!test_cflag(COMMIT_Nolink, ip)) { sxtlck->lwm.offset = (sxtlck->lwm.offset) ? min(middle, (int)sxtlck->lwm.offset) : middle; } rp->header.nextindex = cpu_to_le16(XTENTRYSTART + righthalf + 1); } if (!test_cflag(COMMIT_Nolink, ip)) { sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - sxtlck->lwm.offset; /* rxtlck->lwm.offset = XTENTRYSTART; */ rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - XTENTRYSTART; } *rmpp = rmp; *rbnp = rbn; jfs_info("xtSplitPage: sp:0x%p rp:0x%p", sp, rp); return rc; clean_up: /* Rollback quota allocation. */ if (quota_allocation) dquot_free_block(ip, quota_allocation); return (rc); } /* * xtSplitRoot() * * function: * split the full root page into original/root/split page and new * right page * i.e., root remains fixed in tree anchor (inode) and the root is * copied to a single new right child page since root page << * non-root page, and the split root page contains a single entry * for the new right child page. * * parameter: * int tid, * struct inode *ip, * struct xtsplit *split, * struct metapage **rmpp) * * return: * Pointer to page in which to insert or NULL on error. */ static int xtSplitRoot(tid_t tid, struct inode *ip, struct xtsplit * split, struct metapage ** rmpp) { xtpage_t *sp; struct metapage *rmp; xtpage_t *rp; s64 rbn; int skip, nextindex; xad_t *xad; pxd_t *pxd; struct pxdlist *pxdlist; struct tlock *tlck; struct xtlock *xtlck; int rc; sp = &JFS_IP(ip)->i_xtroot; INCREMENT(xtStat.split); /* * allocate a single (right) child page */ pxdlist = split->pxdlist; pxd = &pxdlist->pxd[pxdlist->npxd]; pxdlist->npxd++; rbn = addressPXD(pxd); rmp = get_metapage(ip, rbn, PSIZE, 1); if (rmp == NULL) return -EIO; /* Allocate blocks to quota. */ rc = dquot_alloc_block(ip, lengthPXD(pxd)); if (rc) { release_metapage(rmp); return rc; } jfs_info("xtSplitRoot: ip:0x%p rmp:0x%p", ip, rmp); /* * acquire a transaction lock on the new right page; * * action: new page; */ BT_MARK_DIRTY(rmp, ip); rp = (xtpage_t *) rmp->data; rp->header.flag = (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; rp->header.self = *pxd; rp->header.nextindex = cpu_to_le16(XTENTRYSTART); rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE); /* initialize sibling pointers */ rp->header.next = 0; rp->header.prev = 0; /* * copy the in-line root page into new right page extent */ nextindex = le16_to_cpu(sp->header.maxentry); memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART], (nextindex - XTENTRYSTART) << L2XTSLOTSIZE); /* * insert the new entry into the new right/child page * (skip index in the new right page will not change) */ skip = split->index; /* if insert into middle, shift right remaining entries */ if (skip != nextindex) memmove(&rp->xad[skip + 1], &rp->xad[skip], (nextindex - skip) * sizeof(xad_t)); xad = &rp->xad[skip]; XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); /* update page header */ rp->header.nextindex = cpu_to_le16(nextindex + 1); if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); xtlck = (struct xtlock *) & tlck->lock; xtlck->lwm.offset = XTENTRYSTART; xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - XTENTRYSTART; } /* * reset the root * * init root with the single entry for the new right page * set the 1st entry offset to 0, which force the left-most key * at any level of the tree to be less than any search key. */ /* * acquire a transaction lock on the root page (in-memory inode); * * action: root split; */ BT_MARK_DIRTY(split->mp, ip); xad = &sp->xad[XTENTRYSTART]; XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn); /* update page header of root */ sp->header.flag &= ~BT_LEAF; sp->header.flag |= BT_INTERNAL; sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; xtlck->lwm.offset = XTENTRYSTART; xtlck->lwm.length = 1; } *rmpp = rmp; jfs_info("xtSplitRoot: sp:0x%p rp:0x%p", sp, rp); return 0; } /* * xtExtend() * * function: extend in-place; * * note: existing extent may or may not have been committed. * caller is responsible for pager buffer cache update, and * working block allocation map update; * update pmap: alloc whole extended extent; */ int xtExtend(tid_t tid, /* transaction id */ struct inode *ip, s64 xoff, /* delta extent offset */ s32 xlen, /* delta extent length */ int flag) { int rc = 0; int cmp; struct metapage *mp; /* meta-page buffer */ xtpage_t *p; /* base B+-tree index page */ s64 bn; int index, nextindex, len; struct btstack btstack; /* traverse stack */ struct xtsplit split; /* split information */ xad_t *xad; s64 xaddr; struct tlock *tlck; struct xtlock *xtlck = NULL; jfs_info("xtExtend: nxoff:0x%lx nxlen:0x%x", (ulong) xoff, xlen); /* there must exist extent to be extended */ if ((rc = xtSearch(ip, xoff - 1, NULL, &cmp, &btstack, XT_INSERT))) return rc; /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); if (cmp != 0) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "xtSearch did not find extent\n"); return -EIO; } /* extension must be contiguous */ xad = &p->xad[index]; if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "extension is not contiguous\n"); return -EIO; } /* * acquire a transaction lock on the leaf page; * * action: xad insertion/extension; */ BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; } /* extend will overflow extent ? */ xlen = lengthXAD(xad) + xlen; if ((len = xlen - MAXXLEN) <= 0) goto extendOld; /* * extent overflow: insert entry for new extent */ //insertNew: xoff = offsetXAD(xad) + MAXXLEN; xaddr = addressXAD(xad) + MAXXLEN; nextindex = le16_to_cpu(p->header.nextindex); /* * if the leaf page is full, insert the new entry and * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ if (nextindex == le16_to_cpu(p->header.maxentry)) { /* xtSpliUp() unpins leaf pages */ split.mp = mp; split.index = index + 1; split.flag = XAD_NEW; split.off = xoff; /* split offset */ split.len = len; split.addr = xaddr; split.pxdlist = NULL; if ((rc = xtSplitUp(tid, ip, &split, &btstack))) return rc; /* get back old page */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* * if leaf root has been split, original root has been * copied to new child page, i.e., original entry now * resides on the new child page; */ if (p->header.flag & BT_INTERNAL) { ASSERT(p->header.nextindex == cpu_to_le16(XTENTRYSTART + 1)); xad = &p->xad[XTENTRYSTART]; bn = addressXAD(xad); XT_PUTPAGE(mp); /* get new child page */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); xtlck = (struct xtlock *) & tlck->lock; } } } /* * insert the new entry into the leaf page */ else { /* insert the new entry: mark the entry NEW */ xad = &p->xad[index + 1]; XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); /* advance next available entry index */ le16_add_cpu(&p->header.nextindex, 1); } /* get back old entry */ xad = &p->xad[index]; xlen = MAXXLEN; /* * extend old extent */ extendOld: XADlength(xad, xlen); if (!(xad->flag & XAD_NEW)) xad->flag |= XAD_EXTENDED; if (!test_cflag(COMMIT_Nolink, ip)) { xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index, (int)xtlck->lwm.offset) : index; xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; } /* unpin the leaf page */ XT_PUTPAGE(mp); return rc; } #ifdef _NOTYET /* * xtTailgate() * * function: split existing 'tail' extent * (split offset >= start offset of tail extent), and * relocate and extend the split tail half; * * note: existing extent may or may not have been committed. * caller is responsible for pager buffer cache update, and * working block allocation map update; * update pmap: free old split tail extent, alloc new extent; */ int xtTailgate(tid_t tid, /* transaction id */ struct inode *ip, s64 xoff, /* split/new extent offset */ s32 xlen, /* new extent length */ s64 xaddr, /* new extent address */ int flag) { int rc = 0; int cmp; struct metapage *mp; /* meta-page buffer */ xtpage_t *p; /* base B+-tree index page */ s64 bn; int index, nextindex, llen, rlen; struct btstack btstack; /* traverse stack */ struct xtsplit split; /* split information */ xad_t *xad; struct tlock *tlck; struct xtlock *xtlck = 0; struct tlock *mtlck; struct maplock *pxdlock; /* printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", (ulong)xoff, xlen, (ulong)xaddr); */ /* there must exist extent to be tailgated */ if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, XT_INSERT))) return rc; /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); if (cmp != 0) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "couldn't find extent\n"); return -EIO; } /* entry found must be last entry */ nextindex = le16_to_cpu(p->header.nextindex); if (index != nextindex - 1) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "the entry found is not the last entry\n"); return -EIO; } BT_MARK_DIRTY(mp, ip); /* * acquire tlock of the leaf page containing original entry */ if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; } /* completely replace extent ? */ xad = &p->xad[index]; /* printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); */ if ((llen = xoff - offsetXAD(xad)) == 0) goto updateOld; /* * partially replace extent: insert entry for new extent */ //insertNew: /* * if the leaf page is full, insert the new entry and * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ if (nextindex == le16_to_cpu(p->header.maxentry)) { /* xtSpliUp() unpins leaf pages */ split.mp = mp; split.index = index + 1; split.flag = XAD_NEW; split.off = xoff; /* split offset */ split.len = xlen; split.addr = xaddr; split.pxdlist = NULL; if ((rc = xtSplitUp(tid, ip, &split, &btstack))) return rc; /* get back old page */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* * if leaf root has been split, original root has been * copied to new child page, i.e., original entry now * resides on the new child page; */ if (p->header.flag & BT_INTERNAL) { ASSERT(p->header.nextindex == cpu_to_le16(XTENTRYSTART + 1)); xad = &p->xad[XTENTRYSTART]; bn = addressXAD(xad); XT_PUTPAGE(mp); /* get new child page */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); xtlck = (struct xtlock *) & tlck->lock; } } } /* * insert the new entry into the leaf page */ else { /* insert the new entry: mark the entry NEW */ xad = &p->xad[index + 1]; XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); /* advance next available entry index */ le16_add_cpu(&p->header.nextindex, 1); } /* get back old XAD */ xad = &p->xad[index]; /* * truncate/relocate old extent at split offset */ updateOld: /* update dmap for old/committed/truncated extent */ rlen = lengthXAD(xad) - llen; if (!(xad->flag & XAD_NEW)) { /* free from PWMAP at commit */ if (!test_cflag(COMMIT_Nolink, ip)) { mtlck = txMaplock(tid, ip, tlckMAP); pxdlock = (struct maplock *) & mtlck->lock; pxdlock->flag = mlckFREEPXD; PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); PXDlength(&pxdlock->pxd, rlen); pxdlock->index = 1; } } else /* free from WMAP */ dbFree(ip, addressXAD(xad) + llen, (s64) rlen); if (llen) /* truncate */ XADlength(xad, llen); else /* replace */ XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); if (!test_cflag(COMMIT_Nolink, ip)) { xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index, (int)xtlck->lwm.offset) : index; xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; } /* unpin the leaf page */ XT_PUTPAGE(mp); return rc; } #endif /* _NOTYET */ /* * xtUpdate() * * function: update XAD; * * update extent for allocated_but_not_recorded or * compressed extent; * * parameter: * nxad - new XAD; * logical extent of the specified XAD must be completely * contained by an existing XAD; */ int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) { /* new XAD */ int rc = 0; int cmp; struct metapage *mp; /* meta-page buffer */ xtpage_t *p; /* base B+-tree index page */ s64 bn; int index0, index, newindex, nextindex; struct btstack btstack; /* traverse stack */ struct xtsplit split; /* split information */ xad_t *xad, *lxad, *rxad; int xflag; s64 nxoff, xoff; int nxlen, xlen, lxlen, rxlen; s64 nxaddr, xaddr; struct tlock *tlck; struct xtlock *xtlck = NULL; int newpage = 0; /* there must exist extent to be tailgated */ nxoff = offsetXAD(nxad); nxlen = lengthXAD(nxad); nxaddr = addressXAD(nxad); if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) return rc; /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); if (cmp != 0) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "Could not find extent\n"); return -EIO; } BT_MARK_DIRTY(mp, ip); /* * acquire tlock of the leaf page containing original entry */ if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; } xad = &p->xad[index0]; xflag = xad->flag; xoff = offsetXAD(xad); xlen = lengthXAD(xad); xaddr = addressXAD(xad); /* nXAD must be completely contained within XAD */ if ((xoff > nxoff) || (nxoff + nxlen > xoff + xlen)) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "nXAD in not completely contained within XAD\n"); return -EIO; } index = index0; newindex = index + 1; nextindex = le16_to_cpu(p->header.nextindex); #ifdef _JFS_WIP_NOCOALESCE if (xoff < nxoff) goto updateRight; /* * replace XAD with nXAD */ replace: /* (nxoff == xoff) */ if (nxlen == xlen) { /* replace XAD with nXAD:recorded */ *xad = *nxad; xad->flag = xflag & ~XAD_NOTRECORDED; goto out; } else /* (nxlen < xlen) */ goto updateLeft; #endif /* _JFS_WIP_NOCOALESCE */ /* #ifdef _JFS_WIP_COALESCE */ if (xoff < nxoff) goto coalesceRight; /* * coalesce with left XAD */ //coalesceLeft: /* (xoff == nxoff) */ /* is XAD first entry of page ? */ if (index == XTENTRYSTART) goto replace; /* is nXAD logically and physically contiguous with lXAD ? */ lxad = &p->xad[index - 1]; lxlen = lengthXAD(lxad); if (!(lxad->flag & XAD_NOTRECORDED) && (nxoff == offsetXAD(lxad) + lxlen) && (nxaddr == addressXAD(lxad) + lxlen) && (lxlen + nxlen < MAXXLEN)) { /* extend right lXAD */ index0 = index - 1; XADlength(lxad, lxlen + nxlen); /* If we just merged two extents together, need to make sure the * right extent gets logged. If the left one is marked XAD_NEW, * then we know it will be logged. Otherwise, mark as * XAD_EXTENDED */ if (!(lxad->flag & XAD_NEW)) lxad->flag |= XAD_EXTENDED; if (xlen > nxlen) { /* truncate XAD */ XADoffset(xad, xoff + nxlen); XADlength(xad, xlen - nxlen); XADaddress(xad, xaddr + nxlen); goto out; } else { /* (xlen == nxlen) */ /* remove XAD */ if (index < nextindex - 1) memmove(&p->xad[index], &p->xad[index + 1], (nextindex - index - 1) << L2XTSLOTSIZE); p->header.nextindex = cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1); index = index0; newindex = index + 1; nextindex = le16_to_cpu(p->header.nextindex); xoff = nxoff = offsetXAD(lxad); xlen = nxlen = lxlen + nxlen; xaddr = nxaddr = addressXAD(lxad); goto coalesceRight; } } /* * replace XAD with nXAD */ replace: /* (nxoff == xoff) */ if (nxlen == xlen) { /* replace XAD with nXAD:recorded */ *xad = *nxad; xad->flag = xflag & ~XAD_NOTRECORDED; goto coalesceRight; } else /* (nxlen < xlen) */ goto updateLeft; /* * coalesce with right XAD */ coalesceRight: /* (xoff <= nxoff) */ /* is XAD last entry of page ? */ if (newindex == nextindex) { if (xoff == nxoff) goto out; goto updateRight; } /* is nXAD logically and physically contiguous with rXAD ? */ rxad = &p->xad[index + 1]; rxlen = lengthXAD(rxad); if (!(rxad->flag & XAD_NOTRECORDED) && (nxoff + nxlen == offsetXAD(rxad)) && (nxaddr + nxlen == addressXAD(rxad)) && (rxlen + nxlen < MAXXLEN)) { /* extend left rXAD */ XADoffset(rxad, nxoff); XADlength(rxad, rxlen + nxlen); XADaddress(rxad, nxaddr); /* If we just merged two extents together, need to make sure * the left extent gets logged. If the right one is marked * XAD_NEW, then we know it will be logged. Otherwise, mark as * XAD_EXTENDED */ if (!(rxad->flag & XAD_NEW)) rxad->flag |= XAD_EXTENDED; if (xlen > nxlen) /* truncate XAD */ XADlength(xad, xlen - nxlen); else { /* (xlen == nxlen) */ /* remove XAD */ memmove(&p->xad[index], &p->xad[index + 1], (nextindex - index - 1) << L2XTSLOTSIZE); p->header.nextindex = cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1); } goto out; } else if (xoff == nxoff) goto out; if (xoff >= nxoff) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "xoff >= nxoff\n"); return -EIO; } /* #endif _JFS_WIP_COALESCE */ /* * split XAD into (lXAD, nXAD): * * |---nXAD---> * --|----------XAD----------|-- * |-lXAD-| */ updateRight: /* (xoff < nxoff) */ /* truncate old XAD as lXAD:not_recorded */ xad = &p->xad[index]; XADlength(xad, nxoff - xoff); /* insert nXAD:recorded */ if (nextindex == le16_to_cpu(p->header.maxentry)) { /* xtSpliUp() unpins leaf pages */ split.mp = mp; split.index = newindex; split.flag = xflag & ~XAD_NOTRECORDED; split.off = nxoff; split.len = nxlen; split.addr = nxaddr; split.pxdlist = NULL; if ((rc = xtSplitUp(tid, ip, &split, &btstack))) return rc; /* get back old page */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* * if leaf root has been split, original root has been * copied to new child page, i.e., original entry now * resides on the new child page; */ if (p->header.flag & BT_INTERNAL) { ASSERT(p->header.nextindex == cpu_to_le16(XTENTRYSTART + 1)); xad = &p->xad[XTENTRYSTART]; bn = addressXAD(xad); XT_PUTPAGE(mp); /* get new child page */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); xtlck = (struct xtlock *) & tlck->lock; } } else { /* is nXAD on new page ? */ if (newindex > (le16_to_cpu(p->header.maxentry) >> 1)) { newindex = newindex - le16_to_cpu(p->header.nextindex) + XTENTRYSTART; newpage = 1; } } } else { /* if insert into middle, shift right remaining entries */ if (newindex < nextindex) memmove(&p->xad[newindex + 1], &p->xad[newindex], (nextindex - newindex) << L2XTSLOTSIZE); /* insert the entry */ xad = &p->xad[newindex]; *xad = *nxad; xad->flag = xflag & ~XAD_NOTRECORDED; /* advance next available entry index. */ p->header.nextindex = cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); } /* * does nXAD force 3-way split ? * * |---nXAD--->| * --|----------XAD-------------|-- * |-lXAD-| |-rXAD -| */ if (nxoff + nxlen == xoff + xlen) goto out; /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */ if (newpage) { /* close out old page */ if (!test_cflag(COMMIT_Nolink, ip)) { xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index0, (int)xtlck->lwm.offset) : index0; xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; } bn = le64_to_cpu(p->header.next); XT_PUTPAGE(mp); /* get new right page */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; } index0 = index = newindex; } else index++; newindex = index + 1; nextindex = le16_to_cpu(p->header.nextindex); xlen = xlen - (nxoff - xoff); xoff = nxoff; xaddr = nxaddr; /* recompute split pages */ if (nextindex == le16_to_cpu(p->header.maxentry)) { XT_PUTPAGE(mp); if ((rc = xtSearch(ip, nxoff, NULL, &cmp, &btstack, XT_INSERT))) return rc; /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); if (cmp != 0) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "xtSearch failed\n"); return -EIO; } if (index0 != index) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "unexpected value of index\n"); return -EIO; } } /* * split XAD into (nXAD, rXAD) * * ---nXAD---| * --|----------XAD----------|-- * |-rXAD-| */ updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */ /* update old XAD with nXAD:recorded */ xad = &p->xad[index]; *xad = *nxad; xad->flag = xflag & ~XAD_NOTRECORDED; /* insert rXAD:not_recorded */ xoff = xoff + nxlen; xlen = xlen - nxlen; xaddr = xaddr + nxlen; if (nextindex == le16_to_cpu(p->header.maxentry)) { /* printf("xtUpdate.updateLeft.split p:0x%p\n", p); */ /* xtSpliUp() unpins leaf pages */ split.mp = mp; split.index = newindex; split.flag = xflag; split.off = xoff; split.len = xlen; split.addr = xaddr; split.pxdlist = NULL; if ((rc = xtSplitUp(tid, ip, &split, &btstack))) return rc; /* get back old page */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* * if leaf root has been split, original root has been * copied to new child page, i.e., original entry now * resides on the new child page; */ if (p->header.flag & BT_INTERNAL) { ASSERT(p->header.nextindex == cpu_to_le16(XTENTRYSTART + 1)); xad = &p->xad[XTENTRYSTART]; bn = addressXAD(xad); XT_PUTPAGE(mp); /* get new child page */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; BT_MARK_DIRTY(mp, ip); if (!test_cflag(COMMIT_Nolink, ip)) { tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); xtlck = (struct xtlock *) & tlck->lock; } } } else { /* if insert into middle, shift right remaining entries */ if (newindex < nextindex) memmove(&p->xad[newindex + 1], &p->xad[newindex], (nextindex - newindex) << L2XTSLOTSIZE); /* insert the entry */ xad = &p->xad[newindex]; XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); /* advance next available entry index. */ p->header.nextindex = cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); } out: if (!test_cflag(COMMIT_Nolink, ip)) { xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index0, (int)xtlck->lwm.offset) : index0; xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; } /* unpin the leaf page */ XT_PUTPAGE(mp); return rc; } /* * xtAppend() * * function: grow in append mode from contiguous region specified ; * * parameter: * tid - transaction id; * ip - file object; * xflag - extent flag: * xoff - extent offset; * maxblocks - max extent length; * xlen - extent length (in/out); * xaddrp - extent address pointer (in/out): * flag - * * return: */ int xtAppend(tid_t tid, /* transaction id */ struct inode *ip, int xflag, s64 xoff, s32 maxblocks, s32 * xlenp, /* (in/out) */ s64 * xaddrp, /* (in/out) */ int flag) { int rc = 0; struct metapage *mp; /* meta-page buffer */ xtpage_t *p; /* base B+-tree index page */ s64 bn, xaddr; int index, nextindex; struct btstack btstack; /* traverse stack */ struct xtsplit split; /* split information */ xad_t *xad; int cmp; struct tlock *tlck; struct xtlock *xtlck; int nsplit, nblocks, xlen; struct pxdlist pxdlist; pxd_t *pxd; s64 next; xaddr = *xaddrp; xlen = *xlenp; jfs_info("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx", (ulong) xoff, maxblocks, xlen, (ulong) xaddr); /* * search for the entry location at which to insert: * * xtFastSearch() and xtSearch() both returns (leaf page * pinned, index at which to insert). * n.b. xtSearch() may return index of maxentry of * the full page. */ if ((rc = xtSearch(ip, xoff, &next, &cmp, &btstack, XT_INSERT))) return rc; /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); if (cmp == 0) { rc = -EEXIST; goto out; } if (next) xlen = min(xlen, (int)(next - xoff)); //insert: /* * insert entry for new extent */ xflag |= XAD_NEW; /* * if the leaf page is full, split the page and * propagate up the router entry for the new page from split * * The xtSplitUp() will insert the entry and unpin the leaf page. */ nextindex = le16_to_cpu(p->header.nextindex); if (nextindex < le16_to_cpu(p->header.maxentry)) goto insertLeaf; /* * allocate new index blocks to cover index page split(s) */ nsplit = btstack.nsplit; split.pxdlist = &pxdlist; pxdlist.maxnpxd = pxdlist.npxd = 0; pxd = &pxdlist.pxd[0]; nblocks = JFS_SBI(ip->i_sb)->nbperpage; for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { PXDaddress(pxd, xaddr); PXDlength(pxd, nblocks); pxdlist.maxnpxd++; continue; } /* undo allocation */ goto out; } xlen = min(xlen, maxblocks); /* * allocate data extent requested */ if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) goto out; split.mp = mp; split.index = index; split.flag = xflag; split.off = xoff; split.len = xlen; split.addr = xaddr; if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { /* undo data extent allocation */ dbFree(ip, *xaddrp, (s64) * xlenp); return rc; } *xaddrp = xaddr; *xlenp = xlen; return 0; /* * insert the new entry into the leaf page */ insertLeaf: /* * allocate data extent requested */ if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) goto out; BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page; * * action: xad insertion/extension; */ tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; /* insert the new entry: mark the entry NEW */ xad = &p->xad[index]; XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); /* advance next available entry index */ le16_add_cpu(&p->header.nextindex, 1); xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; *xaddrp = xaddr; *xlenp = xlen; out: /* unpin the leaf page */ XT_PUTPAGE(mp); return rc; } #ifdef _STILL_TO_PORT /* - TBD for defragmentaion/reorganization - * * xtDelete() * * function: * delete the entry with the specified key. * * N.B.: whole extent of the entry is assumed to be deleted. * * parameter: * * return: * ENOENT: if the entry is not found. * * exception: */ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) { int rc = 0; struct btstack btstack; int cmp; s64 bn; struct metapage *mp; xtpage_t *p; int index, nextindex; struct tlock *tlck; struct xtlock *xtlck; /* * find the matching entry; xtSearch() pins the page */ if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) return rc; XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); if (cmp) { /* unpin the leaf page */ XT_PUTPAGE(mp); return -ENOENT; } /* * delete the entry from the leaf page */ nextindex = le16_to_cpu(p->header.nextindex); le16_add_cpu(&p->header.nextindex, -1); /* * if the leaf page bocome empty, free the page */ if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) return (xtDeleteUp(tid, ip, mp, p, &btstack)); BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page; * * action:xad deletion; */ tlck = txLock(tid, ip, mp, tlckXTREE); xtlck = (struct xtlock *) & tlck->lock; xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; /* if delete from middle, shift left/compact the remaining entries */ if (index < nextindex - 1) memmove(&p->xad[index], &p->xad[index + 1], (nextindex - index - 1) * sizeof(xad_t)); XT_PUTPAGE(mp); return 0; } /* - TBD for defragmentaion/reorganization - * * xtDeleteUp() * * function: * free empty pages as propagating deletion up the tree * * parameter: * * return: */ static int xtDeleteUp(tid_t tid, struct inode *ip, struct metapage * fmp, xtpage_t * fp, struct btstack * btstack) { int rc = 0; struct metapage *mp; xtpage_t *p; int index, nextindex; s64 xaddr; int xlen; struct btframe *parent; struct tlock *tlck; struct xtlock *xtlck; /* * keep root leaf page which has become empty */ if (fp->header.flag & BT_ROOT) { /* keep the root page */ fp->header.flag &= ~BT_INTERNAL; fp->header.flag |= BT_LEAF; fp->header.nextindex = cpu_to_le16(XTENTRYSTART); /* XT_PUTPAGE(fmp); */ return 0; } /* * free non-root leaf page */ if ((rc = xtRelink(tid, ip, fp))) { XT_PUTPAGE(fmp); return rc; } xaddr = addressPXD(&fp->header.self); xlen = lengthPXD(&fp->header.self); /* free the page extent */ dbFree(ip, xaddr, (s64) xlen); /* free the buffer page */ discard_metapage(fmp); /* * propagate page deletion up the index tree * * If the delete from the parent page makes it empty, * continue all the way up the tree. * stop if the root page is reached (which is never deleted) or * if the entry deletion does not empty the page. */ while ((parent = BT_POP(btstack)) != NULL) { /* get/pin the parent page <sp> */ XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); if (rc) return rc; index = parent->index; /* delete the entry for the freed child page from parent. */ nextindex = le16_to_cpu(p->header.nextindex); /* * the parent has the single entry being deleted: * free the parent page which has become empty. */ if (nextindex == 1) { if (p->header.flag & BT_ROOT) { /* keep the root page */ p->header.flag &= ~BT_INTERNAL; p->header.flag |= BT_LEAF; p->header.nextindex = cpu_to_le16(XTENTRYSTART); /* XT_PUTPAGE(mp); */ break; } else { /* free the parent page */ if ((rc = xtRelink(tid, ip, p))) return rc; xaddr = addressPXD(&p->header.self); /* free the page extent */ dbFree(ip, xaddr, (s64) JFS_SBI(ip->i_sb)->nbperpage); /* unpin/free the buffer page */ discard_metapage(mp); /* propagate up */ continue; } } /* * the parent has other entries remaining: * delete the router entry from the parent page. */ else { BT_MARK_DIRTY(mp, ip); /* * acquire a transaction lock on the leaf page; * * action:xad deletion; */ tlck = txLock(tid, ip, mp, tlckXTREE); xtlck = (struct xtlock *) & tlck->lock; xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index, xtlck->lwm. offset) : index; /* if delete from middle, * shift left/compact the remaining entries in the page */ if (index < nextindex - 1) memmove(&p->xad[index], &p->xad[index + 1], (nextindex - index - 1) << L2XTSLOTSIZE); le16_add_cpu(&p->header.nextindex, -1); jfs_info("xtDeleteUp(entry): 0x%lx[%d]", (ulong) parent->bn, index); } /* unpin the parent page */ XT_PUTPAGE(mp); /* exit propagation up */ break; } return 0; } /* * NAME: xtRelocate() * * FUNCTION: relocate xtpage or data extent of regular file; * This function is mainly used by defragfs utility. * * NOTE: This routine does not have the logic to handle * uncommitted allocated extent. The caller should call * txCommit() to commit all the allocation before call * this routine. */ int xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ s64 nxaddr, /* new xaddr */ int xtype) { /* extent type: XTPAGE or DATAEXT */ int rc = 0; struct tblock *tblk; struct tlock *tlck; struct xtlock *xtlck; struct metapage *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ xad_t *xad; pxd_t *pxd; s64 xoff, xsize; int xlen; s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; cbuf_t *cp; s64 offset, nbytes, nbrd, pno; int nb, npages, nblks; s64 bn; int cmp; int index; struct pxd_lock *pxdlock; struct btstack btstack; /* traverse stack */ xtype = xtype & EXTENT_TYPE; xoff = offsetXAD(oxad); oxaddr = addressXAD(oxad); xlen = lengthXAD(oxad); /* validate extent offset */ offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; if (offset >= ip->i_size) return -ESTALE; /* stale extent */ jfs_info("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx", xtype, (ulong) xoff, xlen, (ulong) oxaddr, (ulong) nxaddr); /* * 1. get and validate the parent xtpage/xad entry * covering the source extent to be relocated; */ if (xtype == DATAEXT) { /* search in leaf entry */ rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); if (rc) return rc; /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); if (cmp) { XT_PUTPAGE(pmp); return -ESTALE; } /* validate for exact match with a single entry */ xad = &pp->xad[index]; if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { XT_PUTPAGE(pmp); return -ESTALE; } } else { /* (xtype == XTPAGE) */ /* search in internal entry */ rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); if (rc) return rc; /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); if (cmp) { XT_PUTPAGE(pmp); return -ESTALE; } /* xtSearchNode() validated for exact match with a single entry */ xad = &pp->xad[index]; } jfs_info("xtRelocate: parent xad entry validated."); /* * 2. relocate the extent */ if (xtype == DATAEXT) { /* if the extent is allocated-but-not-recorded * there is no real data to be moved in this extent, */ if (xad->flag & XAD_NOTRECORDED) goto out; else /* release xtpage for cmRead()/xtLookup() */ XT_PUTPAGE(pmp); /* * cmRelocate() * * copy target data pages to be relocated; * * data extent must start at page boundary and * multiple of page size (except the last data extent); * read in each page of the source data extent into cbuf, * update the cbuf extent descriptor of the page to be * homeward bound to new dst data extent * copy the data from the old extent to new extent. * copy is essential for compressed files to avoid problems * that can arise if there was a change in compression * algorithms. * it is a good strategy because it may disrupt cache * policy to keep the pages in memory afterwards. */ offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; assert((offset & CM_OFFSET) == 0); nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; pno = offset >> CM_L2BSIZE; npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; /* npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - (offset >> CM_L2BSIZE) + 1; */ sxaddr = oxaddr; dxaddr = nxaddr; /* process the request one cache buffer at a time */ for (nbrd = 0; nbrd < nbytes; nbrd += nb, offset += nb, pno++, npages--) { /* compute page size */ nb = min(nbytes - nbrd, CM_BSIZE); /* get the cache buffer of the page */ if (rc = cmRead(ip, offset, npages, &cp)) break; assert(addressPXD(&cp->cm_pxd) == sxaddr); assert(!cp->cm_modified); /* bind buffer with the new extent address */ nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; cmSetXD(ip, cp, pno, dxaddr, nblks); /* release the cbuf, mark it as modified */ cmPut(cp, true); dxaddr += nblks; sxaddr += nblks; } /* get back parent page */ if ((rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0))) return rc; XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); jfs_info("xtRelocate: target data extent relocated."); } else { /* (xtype == XTPAGE) */ /* * read in the target xtpage from the source extent; */ XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); if (rc) { XT_PUTPAGE(pmp); return rc; } /* * read in sibling pages if any to update sibling pointers; */ rmp = NULL; if (p->header.next) { nextbn = le64_to_cpu(p->header.next); XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); if (rc) { XT_PUTPAGE(pmp); XT_PUTPAGE(mp); return (rc); } } lmp = NULL; if (p->header.prev) { prevbn = le64_to_cpu(p->header.prev); XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); if (rc) { XT_PUTPAGE(pmp); XT_PUTPAGE(mp); if (rmp) XT_PUTPAGE(rmp); return (rc); } } /* at this point, all xtpages to be updated are in memory */ /* * update sibling pointers of sibling xtpages if any; */ if (lmp) { BT_MARK_DIRTY(lmp, ip); tlck = txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); lp->header.next = cpu_to_le64(nxaddr); XT_PUTPAGE(lmp); } if (rmp) { BT_MARK_DIRTY(rmp, ip); tlck = txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); rp->header.prev = cpu_to_le64(nxaddr); XT_PUTPAGE(rmp); } /* * update the target xtpage to be relocated * * update the self address of the target page * and write to destination extent; * redo image covers the whole xtpage since it is new page * to the destination extent; * update of bmap for the free of source extent * of the target xtpage itself: * update of bmap for the allocation of destination extent * of the target xtpage itself: * update of bmap for the extents covered by xad entries in * the target xtpage is not necessary since they are not * updated; * if not committed before this relocation, * target page may contain XAD_NEW entries which must * be scanned for bmap update (logredo() always * scan xtpage REDOPAGE image for bmap update); * if committed before this relocation (tlckRELOCATE), * scan may be skipped by commit() and logredo(); */ BT_MARK_DIRTY(mp, ip); /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); xtlck = (struct xtlock *) & tlck->lock; /* update the self address in the xtpage header */ pxd = &p->header.self; PXDaddress(pxd, nxaddr); /* linelock for the after image of the whole page */ xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; /* update the buffer extent descriptor of target xtpage */ xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; bmSetXD(mp, nxaddr, xsize); /* unpin the target page to new homeward bound */ XT_PUTPAGE(mp); jfs_info("xtRelocate: target xtpage relocated."); } /* * 3. acquire maplock for the source extent to be freed; * * acquire a maplock saving the src relocated extent address; * to free of the extent at commit time; */ out: /* if DATAEXT relocation, write a LOG_UPDATEMAP record for * free PXD of the source data extent (logredo() will update * bmap for free of source data extent), and update bmap for * free of the source data extent; */ if (xtype == DATAEXT) tlck = txMaplock(tid, ip, tlckMAP); /* if XTPAGE relocation, write a LOG_NOREDOPAGE record * for the source xtpage (logredo() will init NoRedoPage * filter and will also update bmap for free of the source * xtpage), and update bmap for free of the source xtpage; * N.B. We use tlckMAP instead of tlkcXTREE because there * is no buffer associated with this lock since the buffer * has been redirected to the target location. */ else /* (xtype == XTPAGE) */ tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; PXDaddress(&pxdlock->pxd, oxaddr); PXDlength(&pxdlock->pxd, xlen); pxdlock->index = 1; /* * 4. update the parent xad entry for relocation; * * acquire tlck for the parent entry with XAD_NEW as entry * update which will write LOG_REDOPAGE and update bmap for * allocation of XAD_NEW destination extent; */ jfs_info("xtRelocate: update parent xad entry."); BT_MARK_DIRTY(pmp, ip); tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); xtlck = (struct xtlock *) & tlck->lock; /* update the XAD with the new destination extent; */ xad = &pp->xad[index]; xad->flag |= XAD_NEW; XADaddress(xad, nxaddr); xtlck->lwm.offset = min(index, xtlck->lwm.offset); xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - xtlck->lwm.offset; /* unpin the parent xtpage */ XT_PUTPAGE(pmp); return rc; } /* * xtSearchNode() * * function: search for the internal xad entry covering specified extent. * This function is mainly used by defragfs utility. * * parameters: * ip - file object; * xad - extent to find; * cmpp - comparison result: * btstack - traverse stack; * flag - search process flag; * * returns: * btstack contains (bn, index) of search path traversed to the entry. * *cmpp is set to result of comparison with the entry returned. * the page containing the entry is pinned at exit. */ static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ int *cmpp, struct btstack * btstack, int flag) { int rc = 0; s64 xoff, xaddr; int xlen; int cmp = 1; /* init for empty page */ s64 bn; /* block number */ struct metapage *mp; /* meta-page buffer */ xtpage_t *p; /* page */ int base, index, lim; struct btframe *btsp; s64 t64; BT_CLR(btstack); xoff = offsetXAD(xad); xlen = lengthXAD(xad); xaddr = addressXAD(xad); /* * search down tree from root: * * between two consecutive entries of <Ki, Pi> and <Kj, Pj> of * internal page, child page Pi contains entry with k, Ki <= K < Kj. * * if entry with search key K is not found * internal page search find the entry with largest key Ki * less than K which point to the child page to search; * leaf page search find the entry with smallest key Kj * greater than K so that the returned index is the position of * the entry to be shifted right for insertion of new entry. * for empty tree, search key is greater than any key of the tree. * * by convention, root bn = 0. */ for (bn = 0;;) { /* get/pin the page to search */ XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; if (p->header.flag & BT_LEAF) { XT_PUTPAGE(mp); return -ESTALE; } lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; /* * binary search with search key K on the current page */ for (base = XTENTRYSTART; lim; lim >>= 1) { index = base + (lim >> 1); XT_CMP(cmp, xoff, &p->xad[index], t64); if (cmp == 0) { /* * search hit * * verify for exact match; */ if (xaddr == addressXAD(&p->xad[index]) && xoff == offsetXAD(&p->xad[index])) { *cmpp = cmp; /* save search result */ btsp = btstack->top; btsp->bn = bn; btsp->index = index; btsp->mp = mp; return 0; } /* descend/search its child page */ goto next; } if (cmp > 0) { base = index + 1; --lim; } } /* * search miss - non-leaf page: * * base is the smallest index with key (Kj) greater than * search key (K) and may be zero or maxentry index. * if base is non-zero, decrement base by one to get the parent * entry of the child page to search. */ index = base ? base - 1 : base; /* * go down to child page */ next: /* get the child page block number */ bn = addressXAD(&p->xad[index]); /* unpin the parent page */ XT_PUTPAGE(mp); } } /* * xtRelink() * * function: * link around a freed page. * * Parameter: * int tid, * struct inode *ip, * xtpage_t *p) * * returns: */ static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) { int rc = 0; struct metapage *mp; s64 nextbn, prevbn; struct tlock *tlck; nextbn = le64_to_cpu(p->header.next); prevbn = le64_to_cpu(p->header.prev); /* update prev pointer of the next page */ if (nextbn != 0) { XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); if (rc) return rc; /* * acquire a transaction lock on the page; * * action: update prev pointer; */ BT_MARK_DIRTY(mp, ip); tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); /* the page may already have been tlock'd */ p->header.prev = cpu_to_le64(prevbn); XT_PUTPAGE(mp); } /* update next pointer of the previous page */ if (prevbn != 0) { XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); if (rc) return rc; /* * acquire a transaction lock on the page; * * action: update next pointer; */ BT_MARK_DIRTY(mp, ip); tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); /* the page may already have been tlock'd */ p->header.next = le64_to_cpu(nextbn); XT_PUTPAGE(mp); } return 0; } #endif /* _STILL_TO_PORT */ /* * xtInitRoot() * * initialize file root (inline in inode) */ void xtInitRoot(tid_t tid, struct inode *ip) { xtpage_t *p; /* * acquire a transaction lock on the root * * action: */ txLock(tid, ip, (struct metapage *) &JFS_IP(ip)->bxflag, tlckXTREE | tlckNEW); p = &JFS_IP(ip)->i_xtroot; p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; p->header.nextindex = cpu_to_le16(XTENTRYSTART); if (S_ISDIR(ip->i_mode)) p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR); else { p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); ip->i_size = 0; } return; } /* * We can run into a deadlock truncating a file with a large number of * xtree pages (large fragmented file). A robust fix would entail a * reservation system where we would reserve a number of metadata pages * and tlocks which we would be guaranteed without a deadlock. Without * this, a partial fix is to limit number of metadata pages we will lock * in a single transaction. Currently we will truncate the file so that * no more than 50 leaf pages will be locked. The caller of xtTruncate * will be responsible for ensuring that the current transaction gets * committed, and that subsequent transactions are created to truncate * the file further if needed. */ #define MAX_TRUNCATE_LEAVES 50 /* * xtTruncate() * * function: * traverse for truncation logging backward bottom up; * terminate at the last extent entry at the current subtree * root page covering new down size. * truncation may occur within the last extent entry. * * parameter: * int tid, * struct inode *ip, * s64 newsize, * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} * * return: * * note: * PWMAP: * 1. truncate (non-COMMIT_NOLINK file) * by jfs_truncate() or jfs_open(O_TRUNC): * xtree is updated; * 2. truncate index table of directory when last entry removed * map update via tlock at commit time; * PMAP: * Call xtTruncate_pmap instead * WMAP: * 1. remove (free zero link count) on last reference release * (pmap has been freed at commit zero link count); * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): * xtree is updated; * map update directly at truncation time; * * if (DELETE) * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); * else if (TRUNCATE) * must write LOG_NOREDOPAGE for deleted index page; * * pages may already have been tlocked by anonymous transactions * during file growth (i.e., write) before truncation; * * except last truncated entry, deleted entries remains as is * in the page (nextindex is updated) for other use * (e.g., log/update allocation map): this avoid copying the page * info but delay free of pages; * */ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) { int rc = 0; s64 teof; struct metapage *mp; xtpage_t *p; s64 bn; int index, nextindex; xad_t *xad; s64 xoff, xaddr; int xlen, len, freexlen; struct btstack btstack; struct btframe *parent; struct tblock *tblk = NULL; struct tlock *tlck = NULL; struct xtlock *xtlck = NULL; struct xdlistlock xadlock; /* maplock for COMMIT_WMAP */ struct pxd_lock *pxdlock; /* maplock for COMMIT_WMAP */ s64 nfreed; int freed, log; int locked_leaves = 0; /* save object truncation type */ if (tid) { tblk = tid_to_tblock(tid); tblk->xflag |= flag; } nfreed = 0; flag &= COMMIT_MAP; assert(flag != COMMIT_PMAP); if (flag == COMMIT_PWMAP) log = 1; else { log = 0; xadlock.flag = mlckFREEXADLIST; xadlock.index = 1; } /* * if the newsize is not an integral number of pages, * the file between newsize and next page boundary will * be cleared. * if truncating into a file hole, it will cause * a full block to be allocated for the logical block. */ /* * release page blocks of truncated region <teof, eof> * * free the data blocks from the leaf index blocks. * delete the parent index entries corresponding to * the freed child data/index blocks. * free the index blocks themselves which aren't needed * in new sized file. * * index blocks are updated only if the blocks are to be * retained in the new sized file. * if type is PMAP, the data and index pages are NOT * freed, and the data and index blocks are NOT freed * from working map. * (this will allow continued access of data/index of * temporary file (zerolink count file truncated to zero-length)). */ teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >> JFS_SBI(ip->i_sb)->l2bsize; /* clear stack */ BT_CLR(&btstack); /* * start with root * * root resides in the inode */ bn = 0; /* * first access of each page: */ getPage: XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* process entries backward from last index */ index = le16_to_cpu(p->header.nextindex) - 1; /* Since this is the rightmost page at this level, and we may have * already freed a page that was formerly to the right, let's make * sure that the next pointer is zero. */ if (p->header.next) { if (log) /* * Make sure this change to the header is logged. * If we really truncate this leaf, the flag * will be changed to tlckTRUNCATE */ tlck = txLock(tid, ip, mp, tlckXTREE|tlckGROW); BT_MARK_DIRTY(mp, ip); p->header.next = 0; } if (p->header.flag & BT_INTERNAL) goto getChild; /* * leaf page */ freed = 0; /* does region covered by leaf page precede Teof ? */ xad = &p->xad[index]; xoff = offsetXAD(xad); xlen = lengthXAD(xad); if (teof >= xoff + xlen) { XT_PUTPAGE(mp); goto getParent; } /* (re)acquire tlock of the leaf page */ if (log) { if (++locked_leaves > MAX_TRUNCATE_LEAVES) { /* * We need to limit the size of the transaction * to avoid exhausting pagecache & tlocks */ XT_PUTPAGE(mp); newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; goto getParent; } tlck = txLock(tid, ip, mp, tlckXTREE); tlck->type = tlckXTREE | tlckTRUNCATE; xtlck = (struct xtlock *) & tlck->lock; xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; } BT_MARK_DIRTY(mp, ip); /* * scan backward leaf page entries */ for (; index >= XTENTRYSTART; index--) { xad = &p->xad[index]; xoff = offsetXAD(xad); xlen = lengthXAD(xad); xaddr = addressXAD(xad); /* * The "data" for a directory is indexed by the block * device's address space. This metadata must be invalidated * here */ if (S_ISDIR(ip->i_mode) && (teof == 0)) invalidate_xad_metapages(ip, *xad); /* * entry beyond eof: continue scan of current page * xad * ---|---=======-------> * eof */ if (teof < xoff) { nfreed += xlen; continue; } /* * (xoff <= teof): last entry to be deleted from page; * If other entries remain in page: keep and update the page. */ /* * eof == entry_start: delete the entry * xad * -------|=======-------> * eof * */ if (teof == xoff) { nfreed += xlen; if (index == XTENTRYSTART) break; nextindex = index; } /* * eof within the entry: truncate the entry. * xad * -------===|===-------> * eof */ else if (teof < xoff + xlen) { /* update truncated entry */ len = teof - xoff; freexlen = xlen - len; XADlength(xad, len); /* save pxd of truncated extent in tlck */ xaddr += len; if (log) { /* COMMIT_PWMAP */ xtlck->lwm.offset = (xtlck->lwm.offset) ? min(index, (int)xtlck->lwm.offset) : index; xtlck->lwm.length = index + 1 - xtlck->lwm.offset; xtlck->twm.offset = index; pxdlock = (struct pxd_lock *) & xtlck->pxdlock; pxdlock->flag = mlckFREEPXD; PXDaddress(&pxdlock->pxd, xaddr); PXDlength(&pxdlock->pxd, freexlen); } /* free truncated extent */ else { /* COMMIT_WMAP */ pxdlock = (struct pxd_lock *) & xadlock; pxdlock->flag = mlckFREEPXD; PXDaddress(&pxdlock->pxd, xaddr); PXDlength(&pxdlock->pxd, freexlen); txFreeMap(ip, pxdlock, NULL, COMMIT_WMAP); /* reset map lock */ xadlock.flag = mlckFREEXADLIST; } /* current entry is new last entry; */ nextindex = index + 1; nfreed += freexlen; } /* * eof beyond the entry: * xad * -------=======---|---> * eof */ else { /* (xoff + xlen < teof) */ nextindex = index + 1; } if (nextindex < le16_to_cpu(p->header.nextindex)) { if (!log) { /* COMMIT_WAMP */ xadlock.xdlist = &p->xad[nextindex]; xadlock.count = le16_to_cpu(p->header.nextindex) - nextindex; txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); } p->header.nextindex = cpu_to_le16(nextindex); } XT_PUTPAGE(mp); /* assert(freed == 0); */ goto getParent; } /* end scan of leaf page entries */ freed = 1; /* * leaf page become empty: free the page if type != PMAP */ if (log) { /* COMMIT_PWMAP */ /* txCommit() with tlckFREE: * free data extents covered by leaf [XTENTRYSTART:hwm); * invalidate leaf if COMMIT_PWMAP; * if (TRUNCATE), will write LOG_NOREDOPAGE; */ tlck->type = tlckXTREE | tlckFREE; } else { /* COMMIT_WAMP */ /* free data extents covered by leaf */ xadlock.xdlist = &p->xad[XTENTRYSTART]; xadlock.count = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); } if (p->header.flag & BT_ROOT) { p->header.flag &= ~BT_INTERNAL; p->header.flag |= BT_LEAF; p->header.nextindex = cpu_to_le16(XTENTRYSTART); XT_PUTPAGE(mp); /* debug */ goto out; } else { if (log) { /* COMMIT_PWMAP */ /* page will be invalidated at tx completion */ XT_PUTPAGE(mp); } else { /* COMMIT_WMAP */ if (mp->lid) lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; /* invalidate empty leaf page */ discard_metapage(mp); } } /* * the leaf page become empty: delete the parent entry * for the leaf page if the parent page is to be kept * in the new sized file. */ /* * go back up to the parent page */ getParent: /* pop/restore parent entry for the current child page */ if ((parent = BT_POP(&btstack)) == NULL) /* current page must have been root */ goto out; /* get back the parent page */ bn = parent->bn; XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; index = parent->index; /* * child page was not empty: */ if (freed == 0) { /* has any entry deleted from parent ? */ if (index < le16_to_cpu(p->header.nextindex) - 1) { /* (re)acquire tlock on the parent page */ if (log) { /* COMMIT_PWMAP */ /* txCommit() with tlckTRUNCATE: * free child extents covered by parent [); */ tlck = txLock(tid, ip, mp, tlckXTREE); xtlck = (struct xtlock *) & tlck->lock; if (!(tlck->type & tlckTRUNCATE)) { xtlck->hwm.offset = le16_to_cpu(p->header. nextindex) - 1; tlck->type = tlckXTREE | tlckTRUNCATE; } } else { /* COMMIT_WMAP */ /* free child extents covered by parent */ xadlock.xdlist = &p->xad[index + 1]; xadlock.count = le16_to_cpu(p->header.nextindex) - index - 1; txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); } BT_MARK_DIRTY(mp, ip); p->header.nextindex = cpu_to_le16(index + 1); } XT_PUTPAGE(mp); goto getParent; } /* * child page was empty: */ nfreed += lengthXAD(&p->xad[index]); /* * During working map update, child page's tlock must be handled * before parent's. This is because the parent's tlock will cause * the child's disk space to be marked available in the wmap, so * it's important that the child page be released by that time. * * ToDo: tlocks should be on doubly-linked list, so we can * quickly remove it and add it to the end. */ /* * Move parent page's tlock to the end of the tid's tlock list */ if (log && mp->lid && (tblk->last != mp->lid) && lid_to_tlock(mp->lid)->tid) { lid_t lid = mp->lid; struct tlock *prev; tlck = lid_to_tlock(lid); if (tblk->next == lid) tblk->next = tlck->next; else { for (prev = lid_to_tlock(tblk->next); prev->next != lid; prev = lid_to_tlock(prev->next)) { assert(prev->next); } prev->next = tlck->next; } lid_to_tlock(tblk->last)->next = lid; tlck->next = 0; tblk->last = lid; } /* * parent page become empty: free the page */ if (index == XTENTRYSTART) { if (log) { /* COMMIT_PWMAP */ /* txCommit() with tlckFREE: * free child extents covered by parent; * invalidate parent if COMMIT_PWMAP; */ tlck = txLock(tid, ip, mp, tlckXTREE); xtlck = (struct xtlock *) & tlck->lock; xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; tlck->type = tlckXTREE | tlckFREE; } else { /* COMMIT_WMAP */ /* free child extents covered by parent */ xadlock.xdlist = &p->xad[XTENTRYSTART]; xadlock.count = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; txFreeMap(ip, (struct maplock *) & xadlock, NULL, COMMIT_WMAP); } BT_MARK_DIRTY(mp, ip); if (p->header.flag & BT_ROOT) { p->header.flag &= ~BT_INTERNAL; p->header.flag |= BT_LEAF; p->header.nextindex = cpu_to_le16(XTENTRYSTART); if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) { /* * Shrink root down to allow inline * EA (otherwise fsck complains) */ p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); JFS_IP(ip)->mode2 |= INLINEEA; } XT_PUTPAGE(mp); /* debug */ goto out; } else { if (log) { /* COMMIT_PWMAP */ /* page will be invalidated at tx completion */ XT_PUTPAGE(mp); } else { /* COMMIT_WMAP */ if (mp->lid) lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; /* invalidate parent page */ discard_metapage(mp); } /* parent has become empty and freed: * go back up to its parent page */ /* freed = 1; */ goto getParent; } } /* * parent page still has entries for front region; */ else { /* try truncate region covered by preceding entry * (process backward) */ index--; /* go back down to the child page corresponding * to the entry */ goto getChild; } /* * internal page: go down to child page of current entry */ getChild: /* save current parent entry for the child page */ if (BT_STACK_FULL(&btstack)) { jfs_error(ip->i_sb, "stack overrun!\n"); XT_PUTPAGE(mp); return -EIO; } BT_PUSH(&btstack, bn, index); /* get child page */ xad = &p->xad[index]; bn = addressXAD(xad); /* * first access of each internal entry: */ /* release parent page */ XT_PUTPAGE(mp); /* process the child page */ goto getPage; out: /* * update file resource stat */ /* set size */ if (S_ISDIR(ip->i_mode) && !newsize) ip->i_size = 1; /* fsck hates zero-length directories */ else ip->i_size = newsize; /* update quota allocation to reflect freed blocks */ dquot_free_block(ip, nfreed); /* * free tlock of invalidated pages */ if (flag == COMMIT_WMAP) txFreelock(ip); return newsize; } /* * xtTruncate_pmap() * * function: * Perform truncate to zero length for deleted file, leaving the * the xtree and working map untouched. This allows the file to * be accessed via open file handles, while the delete of the file * is committed to disk. * * parameter: * tid_t tid, * struct inode *ip, * s64 committed_size) * * return: new committed size * * note: * * To avoid deadlock by holding too many transaction locks, the * truncation may be broken up into multiple transactions. * The committed_size keeps track of part of the file has been * freed from the pmaps. */ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) { s64 bn; struct btstack btstack; int cmp; int index; int locked_leaves = 0; struct metapage *mp; xtpage_t *p; struct btframe *parent; int rc; struct tblock *tblk; struct tlock *tlck = NULL; xad_t *xad; int xlen; s64 xoff; struct xtlock *xtlck = NULL; /* save object truncation type */ tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_PMAP; /* clear stack */ BT_CLR(&btstack); if (committed_size) { xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1; rc = xtSearch(ip, xoff, NULL, &cmp, &btstack, 0); if (rc) return rc; XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); if (cmp != 0) { XT_PUTPAGE(mp); jfs_error(ip->i_sb, "did not find extent\n"); return -EIO; } } else { /* * start with root * * root resides in the inode */ bn = 0; /* * first access of each page: */ getPage: XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; /* process entries backward from last index */ index = le16_to_cpu(p->header.nextindex) - 1; if (p->header.flag & BT_INTERNAL) goto getChild; } /* * leaf page */ if (++locked_leaves > MAX_TRUNCATE_LEAVES) { /* * We need to limit the size of the transaction * to avoid exhausting pagecache & tlocks */ xad = &p->xad[index]; xoff = offsetXAD(xad); xlen = lengthXAD(xad); XT_PUTPAGE(mp); return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; } tlck = txLock(tid, ip, mp, tlckXTREE); tlck->type = tlckXTREE | tlckFREE; xtlck = (struct xtlock *) & tlck->lock; xtlck->hwm.offset = index; XT_PUTPAGE(mp); /* * go back up to the parent page */ getParent: /* pop/restore parent entry for the current child page */ if ((parent = BT_POP(&btstack)) == NULL) /* current page must have been root */ goto out; /* get back the parent page */ bn = parent->bn; XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); if (rc) return rc; index = parent->index; /* * parent page become empty: free the page */ if (index == XTENTRYSTART) { /* txCommit() with tlckFREE: * free child extents covered by parent; * invalidate parent if COMMIT_PWMAP; */ tlck = txLock(tid, ip, mp, tlckXTREE); xtlck = (struct xtlock *) & tlck->lock; xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; tlck->type = tlckXTREE | tlckFREE; XT_PUTPAGE(mp); if (p->header.flag & BT_ROOT) { goto out; } else { goto getParent; } } /* * parent page still has entries for front region; */ else index--; /* * internal page: go down to child page of current entry */ getChild: /* save current parent entry for the child page */ if (BT_STACK_FULL(&btstack)) { jfs_error(ip->i_sb, "stack overrun!\n"); XT_PUTPAGE(mp); return -EIO; } BT_PUSH(&btstack, bn, index); /* get child page */ xad = &p->xad[index]; bn = addressXAD(xad); /* * first access of each internal entry: */ /* release parent page */ XT_PUTPAGE(mp); /* process the child page */ goto getPage; out: return 0; } #ifdef CONFIG_JFS_STATISTICS static int jfs_xtstat_proc_show(struct seq_file *m, void *v) { seq_printf(m, "JFS Xtree statistics\n" "====================\n" "searches = %d\n" "fast searches = %d\n" "splits = %d\n", xtStat.search, xtStat.fastSearch, xtStat.split); return 0; } static int jfs_xtstat_proc_open(struct inode *inode, struct file *file) { return single_open(file, jfs_xtstat_proc_show, NULL); } const struct file_operations jfs_xtstat_proc_fops = { .open = jfs_xtstat_proc_open, .read = seq_read, .llseek = seq_lseek, .release = single_release, }; #endif
101 121 105 122 102 46 46 46 38 46 65 65 65 7 3 49 54 56 6 65 79 12 114 123 65 33 28 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 /* * fs/f2fs/node.h * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ #define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ #define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 #define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) #define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* control the memory footprint threshold (10MB per 1GB ram) */ #define DEF_RAM_THRESHOLD 1 /* control dirty nats ratio threshold (default: 10% over max nid count) */ #define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 /* control total # of nats */ #define DEF_NAT_CACHE_THRESHOLD 100000 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 #define SETVEC_SIZE 32 /* return value for read_node_page */ #define LOCKED_PAGE 1 /* For flag in struct node_info */ enum { IS_CHECKPOINTED, /* is it checkpointed before? */ HAS_FSYNCED_INODE, /* is the inode fsynced before? */ HAS_LAST_FSYNC, /* has the latest node fsync mark? */ IS_DIRTY, /* this nat entry is dirty? */ }; /* * For node information */ struct node_info { nid_t nid; /* node id */ nid_t ino; /* inode number of the node's owner */ block_t blk_addr; /* block address of the node */ unsigned char version; /* version of the node */ unsigned char flag; /* for node information bits */ }; struct nat_entry { struct list_head list; /* for clean or dirty nat list */ struct node_info ni; /* in-memory node information */ }; #define nat_get_nid(nat) ((nat)->ni.nid) #define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) #define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) #define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) #define nat_get_ino(nat) ((nat)->ni.ino) #define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) #define nat_get_version(nat) ((nat)->ni.version) #define nat_set_version(nat, v) ((nat)->ni.version = (v)) #define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) { dst->nid = src->nid; dst->ino = src->ino; dst->blk_addr = src->blk_addr; dst->version = src->version; /* should not copy flag here */ } static inline void set_nat_flag(struct nat_entry *ne, unsigned int type, bool set) { unsigned char mask = 0x01 << type; if (set) ne->ni.flag |= mask; else ne->ni.flag &= ~mask; } static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) { unsigned char mask = 0x01 << type; return ne->ni.flag & mask; } static inline void nat_reset_flag(struct nat_entry *ne) { /* these states can be set only after checkpoint was done */ set_nat_flag(ne, IS_CHECKPOINTED, true); set_nat_flag(ne, HAS_FSYNCED_INODE, false); set_nat_flag(ne, HAS_LAST_FSYNC, true); } static inline void node_info_from_raw_nat(struct node_info *ni, struct f2fs_nat_entry *raw_ne) { ni->ino = le32_to_cpu(raw_ne->ino); ni->blk_addr = le32_to_cpu(raw_ne->block_addr); ni->version = raw_ne->version; } static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, struct node_info *ni) { raw_ne->ino = cpu_to_le32(ni->ino); raw_ne->block_addr = cpu_to_le32(ni->blk_addr); raw_ne->version = ni->version; } static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) { return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * NM_I(sbi)->dirty_nats_ratio / 100; } static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) { return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; } enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ BASE_CHECK, /* check kernel status */ }; struct nat_entry_set { struct list_head set_list; /* link with other nat sets */ struct list_head entry_list; /* link with dirty nat entries */ nid_t set; /* set number*/ unsigned int entry_cnt; /* the # of nat entries in set */ }; /* * For free nid mangement */ enum nid_state { NID_NEW, /* newly added to free nid list */ NID_ALLOC /* it is allocated */ }; struct free_nid { struct list_head list; /* for free node id list */ nid_t nid; /* node id */ int state; /* in use or not: NID_NEW or NID_ALLOC */ }; static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; spin_lock(&nm_i->nid_list_lock); if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { spin_unlock(&nm_i->nid_list_lock); return; } fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); } /* * inline functions */ static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); #ifdef CONFIG_F2FS_CHECK_FS if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, nm_i->bitmap_size)) f2fs_bug_on(sbi, 1); #endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) { struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; /* * block_off = segment_off * 512 + off_in_segment * OLD = (segment_off * 512) * 2 + off_in_segment * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment */ block_off = NAT_BLOCK_OFFSET(start); block_addr = (pgoff_t)(nm_i->nat_blkaddr + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; return block_addr; } static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, pgoff_t block_addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; block_addr ^= 1 << sbi->log_blocks_per_seg; return block_addr + nm_i->nat_blkaddr; } static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) { unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); f2fs_change_bit(block_off, nm_i->nat_bitmap); #ifdef CONFIG_F2FS_CHECK_FS f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); #endif } static inline nid_t ino_of_node(struct page *node_page) { struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.ino); } static inline nid_t nid_of_node(struct page *node_page) { struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.nid); } static inline unsigned int ofs_of_node(struct page *node_page) { struct f2fs_node *rn = F2FS_NODE(node_page); unsigned flag = le32_to_cpu(rn->footer.flag); return flag >> OFFSET_BIT_SHIFT; } static inline __u64 cpver_of_node(struct page *node_page) { struct f2fs_node *rn = F2FS_NODE(node_page); return le64_to_cpu(rn->footer.cp_ver); } static inline block_t next_blkaddr_of_node(struct page *node_page) { struct f2fs_node *rn = F2FS_NODE(node_page); return le32_to_cpu(rn->footer.next_blkaddr); } static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { struct f2fs_node *rn = F2FS_NODE(page); unsigned int old_flag = 0; if (reset) memset(rn, 0, sizeof(*rn)); else old_flag = le32_to_cpu(rn->footer.flag); rn->footer.nid = cpu_to_le32(nid); rn->footer.ino = cpu_to_le32(ino); /* should remain old flag bits such as COLD_BIT_SHIFT */ rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) | (old_flag & OFFSET_BIT_MASK)); } static inline void copy_node_footer(struct page *dst, struct page *src) { struct f2fs_node *src_rn = F2FS_NODE(src); struct f2fs_node *dst_rn = F2FS_NODE(dst); memcpy(&dst_rn->footer, &src_rn->footer, sizeof(struct node_footer)); } static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); __u64 cp_ver = cur_cp_version(ckpt); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } static inline bool is_recoverable_dnode(struct page *page) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); __u64 cp_ver = cur_cp_version(ckpt); if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) cp_ver |= (cur_cp_crc(ckpt) << 32); return cp_ver == cpver_of_node(page); } /* * f2fs assigns the following node offsets described as (num). * N = NIDS_PER_BLOCK * * Inode block (0) * |- direct node (1) * |- direct node (2) * |- indirect node (3) * | `- direct node (4 => 4 + N - 1) * |- indirect node (4 + N) * | `- direct node (5 + N => 5 + 2N - 1) * `- double indirect node (5 + 2N) * `- indirect node (6 + 2N) * `- direct node * ...... * `- indirect node ((6 + 2N) + x(N + 1)) * `- direct node * ...... * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) * `- direct node */ static inline bool IS_DNODE(struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); if (f2fs_has_xattr_block(ofs)) return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) return false; if (ofs >= 6 + 2 * NIDS_PER_BLOCK) { ofs -= 6 + 2 * NIDS_PER_BLOCK; if (!((long int)ofs % (NIDS_PER_BLOCK + 1))) return false; } return true; } static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); f2fs_wait_on_page_writeback(p, NODE, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) { struct f2fs_node *rn = F2FS_NODE(p); if (i) return le32_to_cpu(rn->i.i_nid[off - NODE_DIR1_BLOCK]); return le32_to_cpu(rn->in.nid[off]); } /* * Coldness identification: * - Mark cold files in f2fs_inode_info * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ static inline int is_cold_data(struct page *page) { return PageChecked(page); } static inline void set_cold_data(struct page *page) { SetPageChecked(page); } static inline void clear_cold_data(struct page *page) { ClearPageChecked(page); } static inline int is_node(struct page *page, int type) { struct f2fs_node *rn = F2FS_NODE(page); return le32_to_cpu(rn->footer.flag) & (1 << type); } #define is_cold_node(page) is_node(page, COLD_BIT_SHIFT) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) static inline int is_inline_node(struct page *page) { return PageChecked(page); } static inline void set_inline_node(struct page *page) { SetPageChecked(page); } static inline void clear_inline_node(struct page *page) { ClearPageChecked(page); } static inline void set_cold_node(struct inode *inode, struct page *page) { struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (S_ISDIR(inode->i_mode)) flag &= ~(0x1 << COLD_BIT_SHIFT); else flag |= (0x1 << COLD_BIT_SHIFT); rn->footer.flag = cpu_to_le32(flag); } static inline void set_mark(struct page *page, int mark, int type) { struct f2fs_node *rn = F2FS_NODE(page); unsigned int flag = le32_to_cpu(rn->footer.flag); if (mark) flag |= (0x1 << type); else flag &= ~(0x1 << type); rn->footer.flag = cpu_to_le32(flag); } #define set_dentry_mark(page, mark) set_mark(page, mark, DENT_BIT_SHIFT) #define set_fsync_mark(page, mark) set_mark(page, mark, FSYNC_BIT_SHIFT)
4 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 /* * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/rose.h> static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose); /* * This routine purges all of the queues of frames. */ void rose_clear_queues(struct sock *sk) { skb_queue_purge(&sk->sk_write_queue); skb_queue_purge(&rose_sk(sk)->ack_queue); } /* * This routine purges the input queue of those frames that have been * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the * SDL diagram. */ void rose_frames_acked(struct sock *sk, unsigned short nr) { struct sk_buff *skb; struct rose_sock *rose = rose_sk(sk); /* * Remove all the ack-ed frames from the ack queue. */ if (rose->va != nr) { while (skb_peek(&rose->ack_queue) != NULL && rose->va != nr) { skb = skb_dequeue(&rose->ack_queue); kfree_skb(skb); rose->va = (rose->va + 1) % ROSE_MODULUS; } } } void rose_requeue_frames(struct sock *sk) { struct sk_buff *skb, *skb_prev = NULL; /* * Requeue all the un-ack-ed frames on the output queue to be picked * up by rose_kick. This arrangement handles the possibility of an * empty output queue. */ while ((skb = skb_dequeue(&rose_sk(sk)->ack_queue)) != NULL) { if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } /* * Validate that the value of nr is between va and vs. Return true or * false for testing. */ int rose_validate_nr(struct sock *sk, unsigned short nr) { struct rose_sock *rose = rose_sk(sk); unsigned short vc = rose->va; while (vc != rose->vs) { if (nr == vc) return 1; vc = (vc + 1) % ROSE_MODULUS; } return nr == rose->vs; } /* * This routine is called when the packet layer internally generates a * control frame. */ void rose_write_internal(struct sock *sk, int frametype) { struct rose_sock *rose = rose_sk(sk); struct sk_buff *skb; unsigned char *dptr; unsigned char lci1, lci2; int maxfaclen = 0; int len, faclen; int reserve; reserve = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + 1; len = ROSE_MIN_LEN; switch (frametype) { case ROSE_CALL_REQUEST: len += 1 + ROSE_ADDR_LEN + ROSE_ADDR_LEN; maxfaclen = 256; break; case ROSE_CALL_ACCEPTED: case ROSE_CLEAR_REQUEST: case ROSE_RESET_REQUEST: len += 2; break; } skb = alloc_skb(reserve + len + maxfaclen, GFP_ATOMIC); if (!skb) return; /* * Space for AX.25 header and PID. */ skb_reserve(skb, reserve); dptr = skb_put(skb, len); lci1 = (rose->lci >> 8) & 0x0F; lci2 = (rose->lci >> 0) & 0xFF; switch (frametype) { case ROSE_CALL_REQUEST: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = ROSE_CALL_REQ_ADDR_LEN_VAL; memcpy(dptr, &rose->dest_addr, ROSE_ADDR_LEN); dptr += ROSE_ADDR_LEN; memcpy(dptr, &rose->source_addr, ROSE_ADDR_LEN); dptr += ROSE_ADDR_LEN; faclen = rose_create_facilities(dptr, rose); skb_put(skb, faclen); dptr += faclen; break; case ROSE_CALL_ACCEPTED: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = 0x00; /* Address length */ *dptr++ = 0; /* Facilities length */ break; case ROSE_CLEAR_REQUEST: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = rose->cause; *dptr++ = rose->diagnostic; break; case ROSE_RESET_REQUEST: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = ROSE_DTE_ORIGINATED; *dptr++ = 0; break; case ROSE_RR: case ROSE_RNR: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr = frametype; *dptr++ |= (rose->vr << 5) & 0xE0; break; case ROSE_CLEAR_CONFIRMATION: case ROSE_RESET_CONFIRMATION: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; break; default: printk(KERN_ERR "ROSE: rose_write_internal - invalid frametype %02X\n", frametype); kfree_skb(skb); return; } rose_transmit_link(skb, rose->neighbour); } int rose_decode(struct sk_buff *skb, int *ns, int *nr, int *q, int *d, int *m) { unsigned char *frame; frame = skb->data; *ns = *nr = *q = *d = *m = 0; switch (frame[2]) { case ROSE_CALL_REQUEST: case ROSE_CALL_ACCEPTED: case ROSE_CLEAR_REQUEST: case ROSE_CLEAR_CONFIRMATION: case ROSE_RESET_REQUEST: case ROSE_RESET_CONFIRMATION: return frame[2]; default: break; } if ((frame[2] & 0x1F) == ROSE_RR || (frame[2] & 0x1F) == ROSE_RNR) { *nr = (frame[2] >> 5) & 0x07; return frame[2] & 0x1F; } if ((frame[2] & 0x01) == ROSE_DATA) { *q = (frame[0] & ROSE_Q_BIT) == ROSE_Q_BIT; *d = (frame[0] & ROSE_D_BIT) == ROSE_D_BIT; *m = (frame[2] & ROSE_M_BIT) == ROSE_M_BIT; *nr = (frame[2] >> 5) & 0x07; *ns = (frame[2] >> 1) & 0x07; return ROSE_DATA; } return ROSE_ILLEGAL; } static int rose_parse_national(unsigned char *p, struct rose_facilities_struct *facilities, int len) { unsigned char *pt; unsigned char l, lg, n = 0; int fac_national_digis_received = 0; do { switch (*p & 0xC0) { case 0x00: if (len < 2) return -1; p += 2; n += 2; len -= 2; break; case 0x40: if (len < 3) return -1; if (*p == FAC_NATIONAL_RAND) facilities->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF); p += 3; n += 3; len -= 3; break; case 0x80: if (len < 4) return -1; p += 4; n += 4; len -= 4; break; case 0xC0: if (len < 2) return -1; l = p[1]; if (len < 2 + l) return -1; if (*p == FAC_NATIONAL_DEST_DIGI) { if (!fac_national_digis_received) { if (l < AX25_ADDR_LEN) return -1; memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN); facilities->source_ndigis = 1; } } else if (*p == FAC_NATIONAL_SRC_DIGI) { if (!fac_national_digis_received) { if (l < AX25_ADDR_LEN) return -1; memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN); facilities->dest_ndigis = 1; } } else if (*p == FAC_NATIONAL_FAIL_CALL) { if (l < AX25_ADDR_LEN) return -1; memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN); } else if (*p == FAC_NATIONAL_FAIL_ADD) { if (l < 1 + ROSE_ADDR_LEN) return -1; memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN); } else if (*p == FAC_NATIONAL_DIGIS) { if (l % AX25_ADDR_LEN) return -1; fac_national_digis_received = 1; facilities->source_ndigis = 0; facilities->dest_ndigis = 0; for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) { if (pt[6] & AX25_HBIT) { if (facilities->dest_ndigis >= ROSE_MAX_DIGIS) return -1; memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN); } else { if (facilities->source_ndigis >= ROSE_MAX_DIGIS) return -1; memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN); } } } p += l + 2; n += l + 2; len -= l + 2; break; } } while (*p != 0x00 && len > 0); return n; } static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *facilities, int len) { unsigned char l, n = 0; char callsign[11]; do { switch (*p & 0xC0) { case 0x00: if (len < 2) return -1; p += 2; n += 2; len -= 2; break; case 0x40: if (len < 3) return -1; p += 3; n += 3; len -= 3; break; case 0x80: if (len < 4) return -1; p += 4; n += 4; len -= 4; break; case 0xC0: if (len < 2) return -1; l = p[1]; /* Prevent overflows*/ if (l < 10 || l > 20) return -1; if (*p == FAC_CCITT_DEST_NSAP) { memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN); memcpy(callsign, p + 12, l - 10); callsign[l - 10] = '\0'; asc2ax(&facilities->source_call, callsign); } if (*p == FAC_CCITT_SRC_NSAP) { memcpy(&facilities->dest_addr, p + 7, ROSE_ADDR_LEN); memcpy(callsign, p + 12, l - 10); callsign[l - 10] = '\0'; asc2ax(&facilities->dest_call, callsign); } p += l + 2; n += l + 2; len -= l + 2; break; } } while (*p != 0x00 && len > 0); return n; } int rose_parse_facilities(unsigned char *p, unsigned packet_len, struct rose_facilities_struct *facilities) { int facilities_len, len; facilities_len = *p++; if (facilities_len == 0 || (unsigned int)facilities_len > packet_len) return 0; while (facilities_len >= 3 && *p == 0x00) { facilities_len--; p++; switch (*p) { case FAC_NATIONAL: /* National */ len = rose_parse_national(p + 1, facilities, facilities_len - 1); break; case FAC_CCITT: /* CCITT */ len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); break; default: printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p); len = 1; break; } if (len < 0) return 0; if (WARN_ON(len >= facilities_len)) return 0; facilities_len -= len + 1; p += len + 1; } return facilities_len == 0; } static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose) { unsigned char *p = buffer + 1; char *callsign; char buf[11]; int len, nb; /* National Facilities */ if (rose->rand != 0 || rose->source_ndigis == 1 || rose->dest_ndigis == 1) { *p++ = 0x00; *p++ = FAC_NATIONAL; if (rose->rand != 0) { *p++ = FAC_NATIONAL_RAND; *p++ = (rose->rand >> 8) & 0xFF; *p++ = (rose->rand >> 0) & 0xFF; } /* Sent before older facilities */ if ((rose->source_ndigis > 0) || (rose->dest_ndigis > 0)) { int maxdigi = 0; *p++ = FAC_NATIONAL_DIGIS; *p++ = AX25_ADDR_LEN * (rose->source_ndigis + rose->dest_ndigis); for (nb = 0 ; nb < rose->source_ndigis ; nb++) { if (++maxdigi >= ROSE_MAX_DIGIS) break; memcpy(p, &rose->source_digis[nb], AX25_ADDR_LEN); p[6] |= AX25_HBIT; p += AX25_ADDR_LEN; } for (nb = 0 ; nb < rose->dest_ndigis ; nb++) { if (++maxdigi >= ROSE_MAX_DIGIS) break; memcpy(p, &rose->dest_digis[nb], AX25_ADDR_LEN); p[6] &= ~AX25_HBIT; p += AX25_ADDR_LEN; } } /* For compatibility */ if (rose->source_ndigis > 0) { *p++ = FAC_NATIONAL_SRC_DIGI; *p++ = AX25_ADDR_LEN; memcpy(p, &rose->source_digis[0], AX25_ADDR_LEN); p += AX25_ADDR_LEN; } /* For compatibility */ if (rose->dest_ndigis > 0) { *p++ = FAC_NATIONAL_DEST_DIGI; *p++ = AX25_ADDR_LEN; memcpy(p, &rose->dest_digis[0], AX25_ADDR_LEN); p += AX25_ADDR_LEN; } } *p++ = 0x00; *p++ = FAC_CCITT; *p++ = FAC_CCITT_DEST_NSAP; callsign = ax2asc(buf, &rose->dest_call); *p++ = strlen(callsign) + 10; *p++ = (strlen(callsign) + 9) * 2; /* ??? */ *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; *p++ = ROSE_ADDR_LEN * 2; memcpy(p, &rose->dest_addr, ROSE_ADDR_LEN); p += ROSE_ADDR_LEN; memcpy(p, callsign, strlen(callsign)); p += strlen(callsign); *p++ = FAC_CCITT_SRC_NSAP; callsign = ax2asc(buf, &rose->source_call); *p++ = strlen(callsign) + 10; *p++ = (strlen(callsign) + 9) * 2; /* ??? */ *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; *p++ = ROSE_ADDR_LEN * 2; memcpy(p, &rose->source_addr, ROSE_ADDR_LEN); p += ROSE_ADDR_LEN; memcpy(p, callsign, strlen(callsign)); p += strlen(callsign); len = p - buffer; buffer[0] = len - 1; return len; } void rose_disconnect(struct sock *sk, int reason, int cause, int diagnostic) { struct rose_sock *rose = rose_sk(sk); rose_stop_timer(sk); rose_stop_idletimer(sk); rose_clear_queues(sk); rose->lci = 0; rose->state = ROSE_STATE_0; if (cause != -1) rose->cause = cause; if (diagnostic != -1) rose->diagnostic = diagnostic; sk->sk_state = TCP_CLOSE; sk->sk_err = reason; sk->sk_shutdown |= SEND_SHUTDOWN; if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } }
6 6 5 5 2 2 3 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 /* RxRPC recvmsg() implementation * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/net.h> #include <linux/skbuff.h> #include <linux/export.h> #include <linux/sched/signal.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" /* * Post a call for attention by the socket or kernel service. Further * notifications are suppressed by putting recvmsg_link on a dummy queue. */ void rxrpc_notify_socket(struct rxrpc_call *call) { struct rxrpc_sock *rx; struct sock *sk; _enter("%d", call->debug_id); if (!list_empty(&call->recvmsg_link)) return; rcu_read_lock(); rx = rcu_dereference(call->socket); sk = &rx->sk; if (rx && sk->sk_state < RXRPC_CLOSE) { if (call->notify_rx) { call->notify_rx(sk, call, call->user_call_ID); } else { write_lock_bh(&rx->recvmsg_lock); if (list_empty(&call->recvmsg_link)) { rxrpc_get_call(call, rxrpc_call_got); list_add_tail(&call->recvmsg_link, &rx->recvmsg_q); } write_unlock_bh(&rx->recvmsg_lock); if (!sock_flag(sk, SOCK_DEAD)) { _debug("call %ps", sk->sk_data_ready); sk->sk_data_ready(sk); } } } rcu_read_unlock(); _leave(""); } /* * Pass a call terminating message to userspace. */ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) { u32 tmp = 0; int ret; switch (call->completion) { case RXRPC_CALL_SUCCEEDED: ret = 0; if (rxrpc_is_service_call(call)) ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ACK, 0, &tmp); break; case RXRPC_CALL_REMOTELY_ABORTED: tmp = call->abort_code; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); break; case RXRPC_CALL_LOCALLY_ABORTED: tmp = call->abort_code; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); break; case RXRPC_CALL_NETWORK_ERROR: tmp = -call->error; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp); break; case RXRPC_CALL_LOCAL_ERROR: tmp = -call->error; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp); break; default: pr_err("Invalid terminal call state %u\n", call->state); BUG(); break; } trace_rxrpc_recvmsg(call, rxrpc_recvmsg_terminal, call->rx_hard_ack, call->rx_pkt_offset, call->rx_pkt_len, ret); return ret; } /* * Pass back notification of a new call. The call is added to the * to-be-accepted list. This means that the next call to be accepted might not * be the last call seen awaiting acceptance, but unless we leave this on the * front of the queue and block all other messages until someone gives us a * user_ID for it, there's not a lot we can do. */ static int rxrpc_recvmsg_new_call(struct rxrpc_sock *rx, struct rxrpc_call *call, struct msghdr *msg, int flags) { int tmp = 0, ret; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NEW_CALL, 0, &tmp); if (ret == 0 && !(flags & MSG_PEEK)) { _debug("to be accepted"); write_lock_bh(&rx->recvmsg_lock); list_del_init(&call->recvmsg_link); write_unlock_bh(&rx->recvmsg_lock); rxrpc_get_call(call, rxrpc_call_got); write_lock(&rx->call_lock); list_add_tail(&call->accept_link, &rx->to_be_accepted); write_unlock(&rx->call_lock); } trace_rxrpc_recvmsg(call, rxrpc_recvmsg_to_be_accepted, 1, 0, 0, ret); return ret; } /* * End the packet reception phase. */ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) { _enter("%d,%s", call->debug_id, rxrpc_call_states[call->state]); trace_rxrpc_receive(call, rxrpc_receive_end, 0, call->rx_top); ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, true, false, rxrpc_propose_ack_terminal_ack); rxrpc_send_ack_packet(call, false); } write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_CLIENT_RECV_REPLY: __rxrpc_call_completed(call); write_unlock_bh(&call->state_lock); break; case RXRPC_CALL_SERVER_RECV_REQUEST: call->tx_phase = true; call->state = RXRPC_CALL_SERVER_ACK_REQUEST; call->ack_at = call->expire_at; write_unlock_bh(&call->state_lock); rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true, rxrpc_propose_ack_processing_op); break; default: write_unlock_bh(&call->state_lock); break; } } /* * Discard a packet we've used up and advance the Rx window by one. */ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; u8 flags; int ix; _enter("%d", call->debug_id); hard_ack = call->rx_hard_ack; top = smp_load_acquire(&call->rx_top); ASSERT(before(hard_ack, top)); hard_ack++; ix = hard_ack & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; rxrpc_see_skb(skb, rxrpc_skb_rx_rotated); sp = rxrpc_skb(skb); flags = sp->hdr.flags; serial = sp->hdr.serial; if (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) serial += (call->rxtx_annotations[ix] & RXRPC_RX_ANNO_JUMBO) - 1; call->rxtx_buffer[ix] = NULL; call->rxtx_annotations[ix] = 0; /* Barrier against rxrpc_input_data(). */ smp_store_release(&call->rx_hard_ack, hard_ack); rxrpc_free_skb(skb, rxrpc_skb_rx_freed); _debug("%u,%u,%02x", hard_ack, top, flags); trace_rxrpc_receive(call, rxrpc_receive_rotate, serial, hard_ack); if (flags & RXRPC_LAST_PACKET) { rxrpc_end_rx_phase(call, serial); } else { /* Check to see if there's an ACK that needs sending. */ if (after_eq(hard_ack, call->ackr_consumed + 2) || after_eq(top, call->ackr_seen + 2) || (hard_ack == top && after(hard_ack, call->ackr_consumed))) rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, true, false, rxrpc_propose_ack_rotate_rx); if (call->ackr_reason) rxrpc_send_ack_packet(call, false); } } /* * Decrypt and verify a (sub)packet. The packet's length may be changed due to * padding, but if this is the case, the packet length will be resident in the * socket buffer. Note that we can't modify the master skb info as the skb may * be the home to multiple subpackets. */ static int rxrpc_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, u8 annotation, unsigned int offset, unsigned int len) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); rxrpc_seq_t seq = sp->hdr.seq; u16 cksum = sp->hdr.cksum; _enter(""); /* For all but the head jumbo subpacket, the security checksum is in a * jumbo header immediately prior to the data. */ if ((annotation & RXRPC_RX_ANNO_JUMBO) > 1) { __be16 tmp; if (skb_copy_bits(skb, offset - 2, &tmp, 2) < 0) BUG(); cksum = ntohs(tmp); seq += (annotation & RXRPC_RX_ANNO_JUMBO) - 1; } return call->conn->security->verify_packet(call, skb, offset, len, seq, cksum); } /* * Locate the data within a packet. This is complicated by: * * (1) An skb may contain a jumbo packet - so we have to find the appropriate * subpacket. * * (2) The (sub)packets may be encrypted and, if so, the encrypted portion * contains an extra header which includes the true length of the data, * excluding any encrypted padding. */ static int rxrpc_locate_data(struct rxrpc_call *call, struct sk_buff *skb, u8 *_annotation, unsigned int *_offset, unsigned int *_len) { unsigned int offset = sizeof(struct rxrpc_wire_header); unsigned int len = *_len; int ret; u8 annotation = *_annotation; /* Locate the subpacket */ len = skb->len - offset; if ((annotation & RXRPC_RX_ANNO_JUMBO) > 0) { offset += (((annotation & RXRPC_RX_ANNO_JUMBO) - 1) * RXRPC_JUMBO_SUBPKTLEN); len = (annotation & RXRPC_RX_ANNO_JLAST) ? skb->len - offset : RXRPC_JUMBO_SUBPKTLEN; } if (!(annotation & RXRPC_RX_ANNO_VERIFIED)) { ret = rxrpc_verify_packet(call, skb, annotation, offset, len); if (ret < 0) return ret; *_annotation |= RXRPC_RX_ANNO_VERIFIED; } *_offset = offset; *_len = len; call->conn->security->locate_data(call, skb, _offset, _len); return 0; } /* * Deliver messages to a call. This keeps processing packets until the buffer * is filled and we find either more DATA (returns 0) or the end of the DATA * (returns 1). If more packets are required, it returns -EAGAIN. */ static int rxrpc_recvmsg_data(struct socket *sock, struct rxrpc_call *call, struct msghdr *msg, struct iov_iter *iter, size_t len, int flags, size_t *_offset) { struct rxrpc_skb_priv *sp; struct sk_buff *skb; rxrpc_seq_t hard_ack, top, seq; size_t remain; bool last; unsigned int rx_pkt_offset, rx_pkt_len; int ix, copy, ret = -EAGAIN, ret2; rx_pkt_offset = call->rx_pkt_offset; rx_pkt_len = call->rx_pkt_len; if (call->state >= RXRPC_CALL_SERVER_ACK_REQUEST) { seq = call->rx_hard_ack; ret = 1; goto done; } /* Barriers against rxrpc_input_data(). */ hard_ack = call->rx_hard_ack; seq = hard_ack + 1; while (top = smp_load_acquire(&call->rx_top), before_eq(seq, top) ) { ix = seq & RXRPC_RXTX_BUFF_MASK; skb = call->rxtx_buffer[ix]; if (!skb) { trace_rxrpc_recvmsg(call, rxrpc_recvmsg_hole, seq, rx_pkt_offset, rx_pkt_len, 0); break; } smp_rmb(); rxrpc_see_skb(skb, rxrpc_skb_rx_seen); sp = rxrpc_skb(skb); if (!(flags & MSG_PEEK)) trace_rxrpc_receive(call, rxrpc_receive_front, sp->hdr.serial, seq); if (msg) sock_recv_timestamp(msg, sock->sk, skb); if (rx_pkt_offset == 0) { ret2 = rxrpc_locate_data(call, skb, &call->rxtx_annotations[ix], &rx_pkt_offset, &rx_pkt_len); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_next, seq, rx_pkt_offset, rx_pkt_len, ret2); if (ret2 < 0) { ret = ret2; goto out; } } else { trace_rxrpc_recvmsg(call, rxrpc_recvmsg_cont, seq, rx_pkt_offset, rx_pkt_len, 0); } /* We have to handle short, empty and used-up DATA packets. */ remain = len - *_offset; copy = rx_pkt_len; if (copy > remain) copy = remain; if (copy > 0) { ret2 = skb_copy_datagram_iter(skb, rx_pkt_offset, iter, copy); if (ret2 < 0) { ret = ret2; goto out; } /* handle piecemeal consumption of data packets */ rx_pkt_offset += copy; rx_pkt_len -= copy; *_offset += copy; } if (rx_pkt_len > 0) { trace_rxrpc_recvmsg(call, rxrpc_recvmsg_full, seq, rx_pkt_offset, rx_pkt_len, 0); ASSERTCMP(*_offset, ==, len); ret = 0; break; } /* The whole packet has been transferred. */ last = sp->hdr.flags & RXRPC_LAST_PACKET; if (!(flags & MSG_PEEK)) rxrpc_rotate_rx_window(call); rx_pkt_offset = 0; rx_pkt_len = 0; if (last) { ASSERTCMP(seq, ==, READ_ONCE(call->rx_top)); ret = 1; goto out; } seq++; } out: if (!(flags & MSG_PEEK)) { call->rx_pkt_offset = rx_pkt_offset; call->rx_pkt_len = rx_pkt_len; } done: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_data_return, seq, rx_pkt_offset, rx_pkt_len, ret); return ret; } /* * Receive a message from an RxRPC socket * - we need to be careful about two or more threads calling recvmsg * simultaneously */ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct rxrpc_call *call; struct rxrpc_sock *rx = rxrpc_sk(sock->sk); struct list_head *l; size_t copied = 0; long timeo; int ret; DEFINE_WAIT(wait); trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_enter, 0, 0, 0, 0); if (flags & (MSG_OOB | MSG_TRUNC)) return -EOPNOTSUPP; timeo = sock_rcvtimeo(&rx->sk, flags & MSG_DONTWAIT); try_again: lock_sock(&rx->sk); /* Return immediately if a client socket has no outstanding calls */ if (RB_EMPTY_ROOT(&rx->calls) && list_empty(&rx->recvmsg_q) && rx->sk.sk_state != RXRPC_SERVER_LISTENING) { release_sock(&rx->sk); return -EAGAIN; } if (list_empty(&rx->recvmsg_q)) { ret = -EWOULDBLOCK; if (timeo == 0) { call = NULL; goto error_no_call; } release_sock(&rx->sk); /* Wait for something to happen */ prepare_to_wait_exclusive(sk_sleep(&rx->sk), &wait, TASK_INTERRUPTIBLE); ret = sock_error(&rx->sk); if (ret) goto wait_error; if (list_empty(&rx->recvmsg_q)) { if (signal_pending(current)) goto wait_interrupted; trace_rxrpc_recvmsg(NULL, rxrpc_recvmsg_wait, 0, 0, 0, 0); timeo = schedule_timeout(timeo); } finish_wait(sk_sleep(&rx->sk), &wait); goto try_again; } /* Find the next call and dequeue it if we're not just peeking. If we * do dequeue it, that comes with a ref that we will need to release. */ write_lock_bh(&rx->recvmsg_lock); l = rx->recvmsg_q.next; call = list_entry(l, struct rxrpc_call, recvmsg_link); if (!(flags & MSG_PEEK)) list_del_init(&call->recvmsg_link); else rxrpc_get_call(call, rxrpc_call_got); write_unlock_bh(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_dequeue, 0, 0, 0, 0); /* We're going to drop the socket lock, so we need to lock the call * against interference by sendmsg. */ if (!mutex_trylock(&call->user_mutex)) { ret = -EWOULDBLOCK; if (flags & MSG_DONTWAIT) goto error_requeue_call; ret = -ERESTARTSYS; if (mutex_lock_interruptible(&call->user_mutex) < 0) goto error_requeue_call; } release_sock(&rx->sk); if (test_bit(RXRPC_CALL_RELEASED, &call->flags)) BUG(); if (test_bit(RXRPC_CALL_HAS_USERID, &call->flags)) { if (flags & MSG_CMSG_COMPAT) { unsigned int id32 = call->user_call_ID; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, sizeof(unsigned int), &id32); } else { unsigned long idl = call->user_call_ID; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_USER_CALL_ID, sizeof(unsigned long), &idl); } if (ret < 0) goto error_unlock_call; } if (msg->msg_name && call->peer) { struct sockaddr_rxrpc *srx = msg->msg_name; size_t len = sizeof(call->peer->srx); memcpy(msg->msg_name, &call->peer->srx, len); srx->srx_service = call->service_id; msg->msg_namelen = len; } switch (READ_ONCE(call->state)) { case RXRPC_CALL_SERVER_ACCEPTING: ret = rxrpc_recvmsg_new_call(rx, call, msg, flags); break; case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: ret = rxrpc_recvmsg_data(sock, call, msg, &msg->msg_iter, len, flags, &copied); if (ret == -EAGAIN) ret = 0; if (after(call->rx_top, call->rx_hard_ack) && call->rxtx_buffer[(call->rx_hard_ack + 1) & RXRPC_RXTX_BUFF_MASK]) rxrpc_notify_socket(call); break; default: ret = 0; break; } if (ret < 0) goto error_unlock_call; if (call->state == RXRPC_CALL_COMPLETE) { ret = rxrpc_recvmsg_term(call, msg); if (ret < 0) goto error_unlock_call; if (!(flags & MSG_PEEK)) rxrpc_release_call(rx, call); msg->msg_flags |= MSG_EOR; ret = 1; } if (ret == 0) msg->msg_flags |= MSG_MORE; else msg->msg_flags &= ~MSG_MORE; ret = copied; error_unlock_call: mutex_unlock(&call->user_mutex); rxrpc_put_call(call, rxrpc_call_put); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); return ret; error_requeue_call: if (!(flags & MSG_PEEK)) { write_lock_bh(&rx->recvmsg_lock); list_add(&call->recvmsg_link, &rx->recvmsg_q); write_unlock_bh(&rx->recvmsg_lock); trace_rxrpc_recvmsg(call, rxrpc_recvmsg_requeue, 0, 0, 0, 0); } else { rxrpc_put_call(call, rxrpc_call_put); } error_no_call: release_sock(&rx->sk); error_trace: trace_rxrpc_recvmsg(call, rxrpc_recvmsg_return, 0, 0, 0, ret); return ret; wait_interrupted: ret = sock_intr_errno(timeo); wait_error: finish_wait(sk_sleep(&rx->sk), &wait); call = NULL; goto error_trace; } /** * rxrpc_kernel_recv_data - Allow a kernel service to receive data/info * @sock: The socket that the call exists on * @call: The call to send data through * @buf: The buffer to receive into * @size: The size of the buffer, including data already read * @_offset: The running offset into the buffer. * @want_more: True if more data is expected to be read * @_abort: Where the abort code is stored if -ECONNABORTED is returned * * Allow a kernel service to receive data and pick up information about the * state of a call. Returns 0 if got what was asked for and there's more * available, 1 if we got what was asked for and we're at the end of the data * and -EAGAIN if we need more data. * * Note that we may return -EAGAIN to drain empty packets at the end of the * data, even if we've already copied over the requested data. * * This function adds the amount it transfers to *_offset, so this should be * precleared as appropriate. Note that the amount remaining in the buffer is * taken to be size - *_offset. * * *_abort should also be initialised to 0. */ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, void *buf, size_t size, size_t *_offset, bool want_more, u32 *_abort) { struct iov_iter iter; struct kvec iov; int ret; _enter("{%d,%s},%zu/%zu,%d", call->debug_id, rxrpc_call_states[call->state], *_offset, size, want_more); ASSERTCMP(*_offset, <=, size); ASSERTCMP(call->state, !=, RXRPC_CALL_SERVER_ACCEPTING); iov.iov_base = buf + *_offset; iov.iov_len = size - *_offset; iov_iter_kvec(&iter, ITER_KVEC | READ, &iov, 1, size - *_offset); mutex_lock(&call->user_mutex); switch (READ_ONCE(call->state)) { case RXRPC_CALL_CLIENT_RECV_REPLY: case RXRPC_CALL_SERVER_RECV_REQUEST: case RXRPC_CALL_SERVER_ACK_REQUEST: ret = rxrpc_recvmsg_data(sock, call, NULL, &iter, size, 0, _offset); if (ret < 0) goto out; /* We can only reach here with a partially full buffer if we * have reached the end of the data. We must otherwise have a * full buffer or have been given -EAGAIN. */ if (ret == 1) { if (*_offset < size) goto short_data; if (!want_more) goto read_phase_complete; ret = 0; goto out; } if (!want_more) goto excess_data; goto out; case RXRPC_CALL_COMPLETE: goto call_complete; default: ret = -EINPROGRESS; goto out; } read_phase_complete: ret = 1; out: mutex_unlock(&call->user_mutex); _leave(" = %d [%zu,%d]", ret, *_offset, *_abort); return ret; short_data: trace_rxrpc_rx_eproto(call, 0, tracepoint_string("short_data")); ret = -EBADMSG; goto out; excess_data: trace_rxrpc_rx_eproto(call, 0, tracepoint_string("excess_data")); ret = -EMSGSIZE; goto out; call_complete: *_abort = call->abort_code; ret = call->error; if (call->completion == RXRPC_CALL_SUCCEEDED) { ret = 1; if (size > 0) ret = -ECONNRESET; } goto out; } EXPORT_SYMBOL(rxrpc_kernel_recv_data);
9 22 9 9 22 9 23 2 2 21 21 2 19 31 31 31 30 9 9 1 1 1 1 37 8 1 1 30 29 29 37 1 37 36 35 34 4 33 3 33 5 33 1 32 4 33 32 32 31 1 32 30 31 30 25 32 28 31 27 31 31 31 31 31 25 6 29 30 38 30 31 31 30 31 31 31 25 6 31 25 31 31 30 31 31 31 31 31 23 33 33 35 33 5 35 31 3 3 3 3 2 7 3 3 2 2 7 1 1 1 71 72 72 64 72 65 64 64 9 1 197 177 176 176 176 176 175 23 197 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 /* * linux/drivers/video/vga16.c -- VGA 16-color framebuffer driver * * Copyright 1999 Ben Pfaff <pfaffben@debian.org> and Petr Vandrovec <VANDROVE@vc.cvut.cz> * Based on VGA info at http://www.goodnet.com/~tinara/FreeVGA/home.htm * Based on VESA framebuffer (c) 1998 Gerd Knorr <kraxel@goldbach.in-berlin.de> * * This file is subject to the terms and conditions of the GNU General * Public License. See the file COPYING in the main directory of this * archive for more details. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/delay.h> #include <linux/fb.h> #include <linux/ioport.h> #include <linux/init.h> #include <linux/platform_device.h> #include <linux/screen_info.h> #include <asm/io.h> #include <video/vga.h> #define VGA_FB_PHYS 0xA0000 #define VGA_FB_PHYS_LEN 65536 #define MODE_SKIP4 1 #define MODE_8BPP 2 #define MODE_CFB 4 #define MODE_TEXT 8 /* --------------------------------------------------------------------- */ /* * card parameters */ struct vga16fb_par { /* structure holding original VGA register settings when the screen is blanked */ struct { unsigned char SeqCtrlIndex; /* Sequencer Index reg. */ unsigned char CrtCtrlIndex; /* CRT-Contr. Index reg. */ unsigned char CrtMiscIO; /* Miscellaneous register */ unsigned char HorizontalTotal; /* CRT-Controller:00h */ unsigned char HorizDisplayEnd; /* CRT-Controller:01h */ unsigned char StartHorizRetrace;/* CRT-Controller:04h */ unsigned char EndHorizRetrace; /* CRT-Controller:05h */ unsigned char Overflow; /* CRT-Controller:07h */ unsigned char StartVertRetrace; /* CRT-Controller:10h */ unsigned char EndVertRetrace; /* CRT-Controller:11h */ unsigned char ModeControl; /* CRT-Controller:17h */ unsigned char ClockingMode; /* Seq-Controller:01h */ } vga_state; struct vgastate state; unsigned int ref_count; int palette_blanked, vesa_blanked, mode, isVGA; u8 misc, pel_msk, vss, clkdiv; u8 crtc[VGA_CRT_C]; }; /* --------------------------------------------------------------------- */ static struct fb_var_screeninfo vga16fb_defined = { .xres = 640, .yres = 480, .xres_virtual = 640, .yres_virtual = 480, .bits_per_pixel = 4, .activate = FB_ACTIVATE_TEST, .height = -1, .width = -1, .pixclock = 39721, .left_margin = 48, .right_margin = 16, .upper_margin = 33, .lower_margin = 10, .hsync_len = 96, .vsync_len = 2, .vmode = FB_VMODE_NONINTERLACED, }; /* name should not depend on EGA/VGA */ static const struct fb_fix_screeninfo vga16fb_fix = { .id = "VGA16 VGA", .smem_start = VGA_FB_PHYS, .smem_len = VGA_FB_PHYS_LEN, .type = FB_TYPE_VGA_PLANES, .type_aux = FB_AUX_VGA_PLANES_VGA4, .visual = FB_VISUAL_PSEUDOCOLOR, .xpanstep = 8, .ypanstep = 1, .line_length = 640 / 8, .accel = FB_ACCEL_NONE }; /* The VGA's weird architecture often requires that we read a byte and write a byte to the same location. It doesn't matter *what* byte we write, however. This is because all the action goes on behind the scenes in the VGA's 32-bit latch register, and reading and writing video memory just invokes latch behavior. To avoid race conditions (is this necessary?), reading and writing the memory byte should be done with a single instruction. One suitable instruction is the x86 bitwise OR. The following read-modify-write routine should optimize to one such bitwise OR. */ static inline void rmw(volatile char __iomem *p) { readb(p); writeb(1, p); } /* Set the Graphics Mode Register, and return its previous value. Bits 0-1 are write mode, bit 3 is read mode. */ static inline int setmode(int mode) { int oldmode; oldmode = vga_io_rgfx(VGA_GFX_MODE); vga_io_w(VGA_GFX_D, mode); return oldmode; } /* Select the Bit Mask Register and return its value. */ static inline int selectmask(void) { return vga_io_rgfx(VGA_GFX_BIT_MASK); } /* Set the value of the Bit Mask Register. It must already have been selected with selectmask(). */ static inline void setmask(int mask) { vga_io_w(VGA_GFX_D, mask); } /* Set the Data Rotate Register and return its old value. Bits 0-2 are rotate count, bits 3-4 are logical operation (0=NOP, 1=AND, 2=OR, 3=XOR). */ static inline int setop(int op) { int oldop; oldop = vga_io_rgfx(VGA_GFX_DATA_ROTATE); vga_io_w(VGA_GFX_D, op); return oldop; } /* Set the Enable Set/Reset Register and return its old value. The code here always uses value 0xf for this register. */ static inline int setsr(int sr) { int oldsr; oldsr = vga_io_rgfx(VGA_GFX_SR_ENABLE); vga_io_w(VGA_GFX_D, sr); return oldsr; } /* Set the Set/Reset Register and return its old value. */ static inline int setcolor(int color) { int oldcolor; oldcolor = vga_io_rgfx(VGA_GFX_SR_VALUE); vga_io_w(VGA_GFX_D, color); return oldcolor; } /* Return the value in the Graphics Address Register. */ static inline int getindex(void) { return vga_io_r(VGA_GFX_I); } /* Set the value in the Graphics Address Register. */ static inline void setindex(int index) { vga_io_w(VGA_GFX_I, index); } static void vga16fb_pan_var(struct fb_info *info, struct fb_var_screeninfo *var) { struct vga16fb_par *par = info->par; u32 xoffset, pos; xoffset = var->xoffset; if (info->var.bits_per_pixel == 8) { pos = (info->var.xres_virtual * var->yoffset + xoffset) >> 2; } else if (par->mode & MODE_TEXT) { int fh = 16; // FIXME !!! font height. Fugde for now. pos = (info->var.xres_virtual * (var->yoffset / fh) + xoffset) >> 3; } else { if (info->var.nonstd) xoffset--; pos = (info->var.xres_virtual * var->yoffset + xoffset) >> 3; } vga_io_wcrt(VGA_CRTC_START_HI, pos >> 8); vga_io_wcrt(VGA_CRTC_START_LO, pos & 0xFF); /* if we support CFB4, then we must! support xoffset with pixel * granularity if someone supports xoffset in bit resolution */ vga_io_r(VGA_IS1_RC); /* reset flip-flop */ vga_io_w(VGA_ATT_IW, VGA_ATC_PEL); if (info->var.bits_per_pixel == 8) vga_io_w(VGA_ATT_IW, (xoffset & 3) << 1); else vga_io_w(VGA_ATT_IW, xoffset & 7); vga_io_r(VGA_IS1_RC); vga_io_w(VGA_ATT_IW, 0x20); } static void vga16fb_update_fix(struct fb_info *info) { if (info->var.bits_per_pixel == 4) { if (info->var.nonstd) { info->fix.type = FB_TYPE_PACKED_PIXELS; info->fix.line_length = info->var.xres_virtual / 2; } else { info->fix.type = FB_TYPE_VGA_PLANES; info->fix.type_aux = FB_AUX_VGA_PLANES_VGA4; info->fix.line_length = info->var.xres_virtual / 8; } } else if (info->var.bits_per_pixel == 0) { info->fix.type = FB_TYPE_TEXT; info->fix.type_aux = FB_AUX_TEXT_CGA; info->fix.line_length = info->var.xres_virtual / 4; } else { /* 8bpp */ if (info->var.nonstd) { info->fix.type = FB_TYPE_VGA_PLANES; info->fix.type_aux = FB_AUX_VGA_PLANES_CFB8; info->fix.line_length = info->var.xres_virtual / 4; } else { info->fix.type = FB_TYPE_PACKED_PIXELS; info->fix.line_length = info->var.xres_virtual; } } } static void vga16fb_clock_chip(struct vga16fb_par *par, unsigned int *pixclock, const struct fb_info *info, int mul, int div) { static const struct { u32 pixclock; u8 misc; u8 seq_clock_mode; } *ptr, *best, vgaclocks[] = { { 79442 /* 12.587 */, 0x00, 0x08}, { 70616 /* 14.161 */, 0x04, 0x08}, { 39721 /* 25.175 */, 0x00, 0x00}, { 35308 /* 28.322 */, 0x04, 0x00}, { 0 /* bad */, 0x00, 0x00}}; int err; *pixclock = (*pixclock * mul) / div; best = vgaclocks; err = *pixclock - best->pixclock; if (err < 0) err = -err; for (ptr = vgaclocks + 1; ptr->pixclock; ptr++) { int tmp; tmp = *pixclock - ptr->pixclock; if (tmp < 0) tmp = -tmp; if (tmp < err) { err = tmp; best = ptr; } } par->misc |= best->misc; par->clkdiv = best->seq_clock_mode; *pixclock = (best->pixclock * div) / mul; } #define FAIL(X) return -EINVAL static int vga16fb_open(struct fb_info *info, int user) { struct vga16fb_par *par = info->par; if (!par->ref_count) { memset(&par->state, 0, sizeof(struct vgastate)); par->state.flags = VGA_SAVE_FONTS | VGA_SAVE_MODE | VGA_SAVE_CMAP; save_vga(&par->state); } par->ref_count++; return 0; } static int vga16fb_release(struct fb_info *info, int user) { struct vga16fb_par *par = info->par; if (!par->ref_count) return -EINVAL; if (par->ref_count == 1) restore_vga(&par->state); par->ref_count--; return 0; } static int vga16fb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) { struct vga16fb_par *par = info->par; u32 xres, right, hslen, left, xtotal; u32 yres, lower, vslen, upper, ytotal; u32 vxres, xoffset, vyres, yoffset; u32 pos; u8 r7, rMode; int shift; int mode; u32 maxmem; par->pel_msk = 0xFF; if (var->bits_per_pixel == 4) { if (var->nonstd) { if (!par->isVGA) return -EINVAL; shift = 3; mode = MODE_SKIP4 | MODE_CFB; maxmem = 16384; par->pel_msk = 0x0F; } else { shift = 3; mode = 0; maxmem = 65536; } } else if (var->bits_per_pixel == 8) { if (!par->isVGA) return -EINVAL; /* no support on EGA */ shift = 2; if (var->nonstd) { mode = MODE_8BPP | MODE_CFB; maxmem = 65536; } else { mode = MODE_SKIP4 | MODE_8BPP | MODE_CFB; maxmem = 16384; } } else return -EINVAL; xres = (var->xres + 7) & ~7; vxres = (var->xres_virtual + 0xF) & ~0xF; xoffset = (var->xoffset + 7) & ~7; left = (var->left_margin + 7) & ~7; right = (var->right_margin + 7) & ~7; hslen = (var->hsync_len + 7) & ~7; if (vxres < xres) vxres = xres; if (xres + xoffset > vxres) xoffset = vxres - xres; var->xres = xres; var->right_margin = right; var->hsync_len = hslen; var->left_margin = left; var->xres_virtual = vxres; var->xoffset = xoffset; xres >>= shift; right >>= shift; hslen >>= shift; left >>= shift; vxres >>= shift; xtotal = xres + right + hslen + left; if (xtotal >= 256) FAIL("xtotal too big"); if (hslen > 32) FAIL("hslen too big"); if (right + hslen + left > 64) FAIL("hblank too big"); par->crtc[VGA_CRTC_H_TOTAL] = xtotal - 5; par->crtc[VGA_CRTC_H_BLANK_START] = xres - 1; par->crtc[VGA_CRTC_H_DISP] = xres - 1; pos = xres + right; par->crtc[VGA_CRTC_H_SYNC_START] = pos; pos += hslen; par->crtc[VGA_CRTC_H_SYNC_END] = pos & 0x1F; pos += left - 2; /* blank_end + 2 <= total + 5 */ par->crtc[VGA_CRTC_H_BLANK_END] = (pos & 0x1F) | 0x80; if (pos & 0x20) par->crtc[VGA_CRTC_H_SYNC_END] |= 0x80; yres = var->yres; lower = var->lower_margin; vslen = var->vsync_len; upper = var->upper_margin; vyres = var->yres_virtual; yoffset = var->yoffset; if (yres > vyres) vyres = yres; if (vxres * vyres > maxmem) { vyres = maxmem / vxres; if (vyres < yres) return -ENOMEM; } if (yoffset + yres > vyres) yoffset = vyres - yres; var->yres = yres; var->lower_margin = lower; var->vsync_len = vslen; var->upper_margin = upper; var->yres_virtual = vyres; var->yoffset = yoffset; if (var->vmode & FB_VMODE_DOUBLE) { yres <<= 1; lower <<= 1; vslen <<= 1; upper <<= 1; } ytotal = yres + lower + vslen + upper; if (ytotal > 1024) { ytotal >>= 1; yres >>= 1; lower >>= 1; vslen >>= 1; upper >>= 1; rMode = 0x04; } else rMode = 0x00; if (ytotal > 1024) FAIL("ytotal too big"); if (vslen > 16) FAIL("vslen too big"); par->crtc[VGA_CRTC_V_TOTAL] = ytotal - 2; r7 = 0x10; /* disable linecompare */ if (ytotal & 0x100) r7 |= 0x01; if (ytotal & 0x200) r7 |= 0x20; par->crtc[VGA_CRTC_PRESET_ROW] = 0; par->crtc[VGA_CRTC_MAX_SCAN] = 0x40; /* 1 scanline, no linecmp */ if (var->vmode & FB_VMODE_DOUBLE) par->crtc[VGA_CRTC_MAX_SCAN] |= 0x80; par->crtc[VGA_CRTC_CURSOR_START] = 0x20; par->crtc[VGA_CRTC_CURSOR_END] = 0x00; if ((mode & (MODE_CFB | MODE_8BPP)) == MODE_CFB) xoffset--; pos = yoffset * vxres + (xoffset >> shift); par->crtc[VGA_CRTC_START_HI] = pos >> 8; par->crtc[VGA_CRTC_START_LO] = pos & 0xFF; par->crtc[VGA_CRTC_CURSOR_HI] = 0x00; par->crtc[VGA_CRTC_CURSOR_LO] = 0x00; pos = yres - 1; par->crtc[VGA_CRTC_V_DISP_END] = pos & 0xFF; par->crtc[VGA_CRTC_V_BLANK_START] = pos & 0xFF; if (pos & 0x100) r7 |= 0x0A; /* 0x02 -> DISP_END, 0x08 -> BLANK_START */ if (pos & 0x200) { r7 |= 0x40; /* 0x40 -> DISP_END */ par->crtc[VGA_CRTC_MAX_SCAN] |= 0x20; /* BLANK_START */ } pos += lower; par->crtc[VGA_CRTC_V_SYNC_START] = pos & 0xFF; if (pos & 0x100) r7 |= 0x04; if (pos & 0x200) r7 |= 0x80; pos += vslen; par->crtc[VGA_CRTC_V_SYNC_END] = (pos & 0x0F) & ~0x10; /* disabled IRQ */ pos += upper - 1; /* blank_end + 1 <= ytotal + 2 */ par->crtc[VGA_CRTC_V_BLANK_END] = pos & 0xFF; /* 0x7F for original VGA, but some SVGA chips requires all 8 bits to set */ if (vxres >= 512) FAIL("vxres too long"); par->crtc[VGA_CRTC_OFFSET] = vxres >> 1; if (mode & MODE_SKIP4) par->crtc[VGA_CRTC_UNDERLINE] = 0x5F; /* 256, cfb8 */ else par->crtc[VGA_CRTC_UNDERLINE] = 0x1F; /* 16, vgap */ par->crtc[VGA_CRTC_MODE] = rMode | ((mode & MODE_TEXT) ? 0xA3 : 0xE3); par->crtc[VGA_CRTC_LINE_COMPARE] = 0xFF; par->crtc[VGA_CRTC_OVERFLOW] = r7; par->vss = 0x00; /* 3DA */ par->misc = 0xE3; /* enable CPU, ports 0x3Dx, positive sync */ if (var->sync & FB_SYNC_HOR_HIGH_ACT) par->misc &= ~0x40; if (var->sync & FB_SYNC_VERT_HIGH_ACT) par->misc &= ~0x80; par->mode = mode; if (mode & MODE_8BPP) /* pixel clock == vga clock / 2 */ vga16fb_clock_chip(par, &var->pixclock, info, 1, 2); else /* pixel clock == vga clock */ vga16fb_clock_chip(par, &var->pixclock, info, 1, 1); var->red.offset = var->green.offset = var->blue.offset = var->transp.offset = 0; var->red.length = var->green.length = var->blue.length = (par->isVGA) ? 6 : 2; var->transp.length = 0; var->activate = FB_ACTIVATE_NOW; var->height = -1; var->width = -1; var->accel_flags = 0; return 0; } #undef FAIL static int vga16fb_set_par(struct fb_info *info) { struct vga16fb_par *par = info->par; u8 gdc[VGA_GFX_C]; u8 seq[VGA_SEQ_C]; u8 atc[VGA_ATT_C]; int fh, i; seq[VGA_SEQ_CLOCK_MODE] = 0x01 | par->clkdiv; if (par->mode & MODE_TEXT) seq[VGA_SEQ_PLANE_WRITE] = 0x03; else seq[VGA_SEQ_PLANE_WRITE] = 0x0F; seq[VGA_SEQ_CHARACTER_MAP] = 0x00; if (par->mode & MODE_TEXT) seq[VGA_SEQ_MEMORY_MODE] = 0x03; else if (par->mode & MODE_SKIP4) seq[VGA_SEQ_MEMORY_MODE] = 0x0E; else seq[VGA_SEQ_MEMORY_MODE] = 0x06; gdc[VGA_GFX_SR_VALUE] = 0x00; gdc[VGA_GFX_SR_ENABLE] = 0x00; gdc[VGA_GFX_COMPARE_VALUE] = 0x00; gdc[VGA_GFX_DATA_ROTATE] = 0x00; gdc[VGA_GFX_PLANE_READ] = 0; if (par->mode & MODE_TEXT) { gdc[VGA_GFX_MODE] = 0x10; gdc[VGA_GFX_MISC] = 0x06; } else { if (par->mode & MODE_CFB) gdc[VGA_GFX_MODE] = 0x40; else gdc[VGA_GFX_MODE] = 0x00; gdc[VGA_GFX_MISC] = 0x05; } gdc[VGA_GFX_COMPARE_MASK] = 0x0F; gdc[VGA_GFX_BIT_MASK] = 0xFF; for (i = 0x00; i < 0x10; i++) atc[i] = i; if (par->mode & MODE_TEXT) atc[VGA_ATC_MODE] = 0x04; else if (par->mode & MODE_8BPP) atc[VGA_ATC_MODE] = 0x41; else atc[VGA_ATC_MODE] = 0x81; atc[VGA_ATC_OVERSCAN] = 0x00; /* 0 for EGA, 0xFF for VGA */ atc[VGA_ATC_PLANE_ENABLE] = 0x0F; if (par->mode & MODE_8BPP) atc[VGA_ATC_PEL] = (info->var.xoffset & 3) << 1; else atc[VGA_ATC_PEL] = info->var.xoffset & 7; atc[VGA_ATC_COLOR_PAGE] = 0x00; if (par->mode & MODE_TEXT) { fh = 16; // FIXME !!! Fudge font height. par->crtc[VGA_CRTC_MAX_SCAN] = (par->crtc[VGA_CRTC_MAX_SCAN] & ~0x1F) | (fh - 1); } vga_io_w(VGA_MIS_W, vga_io_r(VGA_MIS_R) | 0x01); /* Enable graphics register modification */ if (!par->isVGA) { vga_io_w(EGA_GFX_E0, 0x00); vga_io_w(EGA_GFX_E1, 0x01); } /* update misc output register */ vga_io_w(VGA_MIS_W, par->misc); /* synchronous reset on */ vga_io_wseq(0x00, 0x01); if (par->isVGA) vga_io_w(VGA_PEL_MSK, par->pel_msk); /* write sequencer registers */ vga_io_wseq(VGA_SEQ_CLOCK_MODE, seq[VGA_SEQ_CLOCK_MODE] | 0x20); for (i = 2; i < VGA_SEQ_C; i++) { vga_io_wseq(i, seq[i]); } /* synchronous reset off */ vga_io_wseq(0x00, 0x03); /* deprotect CRT registers 0-7 */ vga_io_wcrt(VGA_CRTC_V_SYNC_END, par->crtc[VGA_CRTC_V_SYNC_END]); /* write CRT registers */ for (i = 0; i < VGA_CRTC_REGS; i++) { vga_io_wcrt(i, par->crtc[i]); } /* write graphics controller registers */ for (i = 0; i < VGA_GFX_C; i++) { vga_io_wgfx(i, gdc[i]); } /* write attribute controller registers */ for (i = 0; i < VGA_ATT_C; i++) { vga_io_r(VGA_IS1_RC); /* reset flip-flop */ vga_io_wattr(i, atc[i]); } /* Wait for screen to stabilize. */ mdelay(50); vga_io_wseq(VGA_SEQ_CLOCK_MODE, seq[VGA_SEQ_CLOCK_MODE]); vga_io_r(VGA_IS1_RC); vga_io_w(VGA_ATT_IW, 0x20); vga16fb_update_fix(info); return 0; } static void ega16_setpalette(int regno, unsigned red, unsigned green, unsigned blue) { static const unsigned char map[] = { 000, 001, 010, 011 }; int val; if (regno >= 16) return; val = map[red>>14] | ((map[green>>14]) << 1) | ((map[blue>>14]) << 2); vga_io_r(VGA_IS1_RC); /* ! 0x3BA */ vga_io_wattr(regno, val); vga_io_r(VGA_IS1_RC); /* some clones need it */ vga_io_w(VGA_ATT_IW, 0x20); /* unblank screen */ } static void vga16_setpalette(int regno, unsigned red, unsigned green, unsigned blue) { outb(regno, VGA_PEL_IW); outb(red >> 10, VGA_PEL_D); outb(green >> 10, VGA_PEL_D); outb(blue >> 10, VGA_PEL_D); } static int vga16fb_setcolreg(unsigned regno, unsigned red, unsigned green, unsigned blue, unsigned transp, struct fb_info *info) { struct vga16fb_par *par = info->par; int gray; /* * Set a single color register. The values supplied are * already rounded down to the hardware's capabilities * (according to the entries in the `var' structure). Return * != 0 for invalid regno. */ if (regno >= 256) return 1; gray = info->var.grayscale; if (gray) { /* gray = 0.30*R + 0.59*G + 0.11*B */ red = green = blue = (red * 77 + green * 151 + blue * 28) >> 8; } if (par->isVGA) vga16_setpalette(regno,red,green,blue); else ega16_setpalette(regno,red,green,blue); return 0; } static int vga16fb_pan_display(struct fb_var_screeninfo *var, struct fb_info *info) { vga16fb_pan_var(info, var); return 0; } /* The following VESA blanking code is taken from vgacon.c. The VGA blanking code was originally by Huang shi chao, and modified by Christoph Rimek (chrimek@toppoint.de) and todd j. derr (tjd@barefoot.org) for Linux. */ static void vga_vesa_blank(struct vga16fb_par *par, int mode) { unsigned char SeqCtrlIndex = vga_io_r(VGA_SEQ_I); unsigned char CrtCtrlIndex = vga_io_r(VGA_CRT_IC); /* save original values of VGA controller registers */ if(!par->vesa_blanked) { par->vga_state.CrtMiscIO = vga_io_r(VGA_MIS_R); //sti(); par->vga_state.HorizontalTotal = vga_io_rcrt(0x00); /* HorizontalTotal */ par->vga_state.HorizDisplayEnd = vga_io_rcrt(0x01); /* HorizDisplayEnd */ par->vga_state.StartHorizRetrace = vga_io_rcrt(0x04); /* StartHorizRetrace */ par->vga_state.EndHorizRetrace = vga_io_rcrt(0x05); /* EndHorizRetrace */ par->vga_state.Overflow = vga_io_rcrt(0x07); /* Overflow */ par->vga_state.StartVertRetrace = vga_io_rcrt(0x10); /* StartVertRetrace */ par->vga_state.EndVertRetrace = vga_io_rcrt(0x11); /* EndVertRetrace */ par->vga_state.ModeControl = vga_io_rcrt(0x17); /* ModeControl */ par->vga_state.ClockingMode = vga_io_rseq(0x01); /* ClockingMode */ } /* assure that video is enabled */ /* "0x20" is VIDEO_ENABLE_bit in register 01 of sequencer */ vga_io_wseq(0x01, par->vga_state.ClockingMode | 0x20); /* test for vertical retrace in process.... */ if ((par->vga_state.CrtMiscIO & 0x80) == 0x80) vga_io_w(VGA_MIS_W, par->vga_state.CrtMiscIO & 0xef); /* * Set <End of vertical retrace> to minimum (0) and * <Start of vertical Retrace> to maximum (incl. overflow) * Result: turn off vertical sync (VSync) pulse. */ if (mode & FB_BLANK_VSYNC_SUSPEND) { vga_io_wcrt(VGA_CRTC_V_SYNC_START, 0xff); vga_io_wcrt(VGA_CRTC_V_SYNC_END, 0x40); /* bits 9,10 of vert. retrace */ vga_io_wcrt(VGA_CRTC_OVERFLOW, par->vga_state.Overflow | 0x84); } if (mode & FB_BLANK_HSYNC_SUSPEND) { /* * Set <End of horizontal retrace> to minimum (0) and * <Start of horizontal Retrace> to maximum * Result: turn off horizontal sync (HSync) pulse. */ vga_io_wcrt(VGA_CRTC_H_SYNC_START, 0xff); vga_io_wcrt(VGA_CRTC_H_SYNC_END, 0x00); } /* restore both index registers */ outb_p(SeqCtrlIndex, VGA_SEQ_I); outb_p(CrtCtrlIndex, VGA_CRT_IC); } static void vga_vesa_unblank(struct vga16fb_par *par) { unsigned char SeqCtrlIndex = vga_io_r(VGA_SEQ_I); unsigned char CrtCtrlIndex = vga_io_r(VGA_CRT_IC); /* restore original values of VGA controller registers */ vga_io_w(VGA_MIS_W, par->vga_state.CrtMiscIO); /* HorizontalTotal */ vga_io_wcrt(0x00, par->vga_state.HorizontalTotal); /* HorizDisplayEnd */ vga_io_wcrt(0x01, par->vga_state.HorizDisplayEnd); /* StartHorizRetrace */ vga_io_wcrt(0x04, par->vga_state.StartHorizRetrace); /* EndHorizRetrace */ vga_io_wcrt(0x05, par->vga_state.EndHorizRetrace); /* Overflow */ vga_io_wcrt(0x07, par->vga_state.Overflow); /* StartVertRetrace */ vga_io_wcrt(0x10, par->vga_state.StartVertRetrace); /* EndVertRetrace */ vga_io_wcrt(0x11, par->vga_state.EndVertRetrace); /* ModeControl */ vga_io_wcrt(0x17, par->vga_state.ModeControl); /* ClockingMode */ vga_io_wseq(0x01, par->vga_state.ClockingMode); /* restore index/control registers */ vga_io_w(VGA_SEQ_I, SeqCtrlIndex); vga_io_w(VGA_CRT_IC, CrtCtrlIndex); } static void vga_pal_blank(void) { int i; for (i=0; i<16; i++) { outb_p(i, VGA_PEL_IW); outb_p(0, VGA_PEL_D); outb_p(0, VGA_PEL_D); outb_p(0, VGA_PEL_D); } } /* 0 unblank, 1 blank, 2 no vsync, 3 no hsync, 4 off */ static int vga16fb_blank(int blank, struct fb_info *info) { struct vga16fb_par *par = info->par; switch (blank) { case FB_BLANK_UNBLANK: /* Unblank */ if (par->vesa_blanked) { vga_vesa_unblank(par); par->vesa_blanked = 0; } if (par->palette_blanked) { par->palette_blanked = 0; } break; case FB_BLANK_NORMAL: /* blank */ vga_pal_blank(); par->palette_blanked = 1; break; default: /* VESA blanking */ vga_vesa_blank(par, blank); par->vesa_blanked = 1; break; } return 0; } static void vga_8planes_fillrect(struct fb_info *info, const struct fb_fillrect *rect) { u32 dx = rect->dx, width = rect->width; char oldindex = getindex(); char oldmode = setmode(0x40); char oldmask = selectmask(); int line_ofs, height; char oldop, oldsr; char __iomem *where; dx /= 4; where = info->screen_base + dx + rect->dy * info->fix.line_length; if (rect->rop == ROP_COPY) { oldop = setop(0); oldsr = setsr(0); width /= 4; line_ofs = info->fix.line_length - width; setmask(0xff); height = rect->height; while (height--) { int x; /* we can do memset... */ for (x = width; x > 0; --x) { writeb(rect->color, where); where++; } where += line_ofs; } } else { char oldcolor = setcolor(0xf); int y; oldop = setop(0x18); oldsr = setsr(0xf); setmask(0x0F); for (y = 0; y < rect->height; y++) { rmw(where); rmw(where+1); where += info->fix.line_length; } setcolor(oldcolor); } setmask(oldmask); setsr(oldsr); setop(oldop); setmode(oldmode); setindex(oldindex); } static void vga16fb_fillrect(struct fb_info *info, const struct fb_fillrect *rect) { int x, x2, y2, vxres, vyres, width, height, line_ofs; char __iomem *dst; vxres = info->var.xres_virtual; vyres = info->var.yres_virtual; if (!rect->width || !rect->height || rect->dx > vxres || rect->dy > vyres) return; /* We could use hardware clipping but on many cards you get around * hardware clipping by writing to framebuffer directly. */ x2 = rect->dx + rect->width; y2 = rect->dy + rect->height; x2 = x2 < vxres ? x2 : vxres; y2 = y2 < vyres ? y2 : vyres; width = x2 - rect->dx; switch (info->fix.type) { case FB_TYPE_VGA_PLANES: if (info->fix.type_aux == FB_AUX_VGA_PLANES_VGA4) { height = y2 - rect->dy; width = rect->width/8; line_ofs = info->fix.line_length - width; dst = info->screen_base + (rect->dx/8) + rect->dy * info->fix.line_length; switch (rect->rop) { case ROP_COPY: setmode(0); setop(0); setsr(0xf); setcolor(rect->color); selectmask(); setmask(0xff); while (height--) { for (x = 0; x < width; x++) { writeb(0, dst); dst++; } dst += line_ofs; } break; case ROP_XOR: setmode(0); setop(0x18); setsr(0xf); setcolor(0xf); selectmask(); setmask(0xff); while (height--) { for (x = 0; x < width; x++) { rmw(dst); dst++; } dst += line_ofs; } break; } } else vga_8planes_fillrect(info, rect); break; case FB_TYPE_PACKED_PIXELS: default: cfb_fillrect(info, rect); break; } } static void vga_8planes_copyarea(struct fb_info *info, const struct fb_copyarea *area) { char oldindex = getindex(); char oldmode = setmode(0x41); char oldop = setop(0); char oldsr = setsr(0xf); int height, line_ofs, x; u32 sx, dx, width; char __iomem *dest; char __iomem *src; height = area->height; sx = area->sx / 4; dx = area->dx / 4; width = area->width / 4; if (area->dy < area->sy || (area->dy == area->sy && dx < sx)) { line_ofs = info->fix.line_length - width; dest = info->screen_base + dx + area->dy * info->fix.line_length; src = info->screen_base + sx + area->sy * info->fix.line_length; while (height--) { for (x = 0; x < width; x++) { readb(src); writeb(0, dest); src++; dest++; } src += line_ofs; dest += line_ofs; } } else { line_ofs = info->fix.line_length - width; dest = info->screen_base + dx + width + (area->dy + height - 1) * info->fix.line_length; src = info->screen_base + sx + width + (area->sy + height - 1) * info->fix.line_length; while (height--) { for (x = 0; x < width; x++) { --src; --dest; readb(src); writeb(0, dest); } src -= line_ofs; dest -= line_ofs; } } setsr(oldsr); setop(oldop); setmode(oldmode); setindex(oldindex); } static void vga16fb_copyarea(struct fb_info *info, const struct fb_copyarea *area) { u32 dx = area->dx, dy = area->dy, sx = area->sx, sy = area->sy; int x, x2, y2, old_dx, old_dy, vxres, vyres; int height, width, line_ofs; char __iomem *dst = NULL; char __iomem *src = NULL; vxres = info->var.xres_virtual; vyres = info->var.yres_virtual; if (area->dx > vxres || area->sx > vxres || area->dy > vyres || area->sy > vyres) return; /* clip the destination */ old_dx = area->dx; old_dy = area->dy; /* * We could use hardware clipping but on many cards you get around * hardware clipping by writing to framebuffer directly. */ x2 = area->dx + area->width; y2 = area->dy + area->height; dx = area->dx > 0 ? area->dx : 0; dy = area->dy > 0 ? area->dy : 0; x2 = x2 < vxres ? x2 : vxres; y2 = y2 < vyres ? y2 : vyres; width = x2 - dx; height = y2 - dy; if (sx + dx < old_dx || sy + dy < old_dy) return; /* update sx1,sy1 */ sx += (dx - old_dx); sy += (dy - old_dy); /* the source must be completely inside the virtual screen */ if (sx + width > vxres || sy + height > vyres) return; switch (info->fix.type) { case FB_TYPE_VGA_PLANES: if (info->fix.type_aux == FB_AUX_VGA_PLANES_VGA4) { width = width/8; height = height; line_ofs = info->fix.line_length - width; setmode(1); setop(0); setsr(0xf); if (dy < sy || (dy == sy && dx < sx)) { dst = info->screen_base + (dx/8) + dy * info->fix.line_length; src = info->screen_base + (sx/8) + sy * info->fix.line_length; while (height--) { for (x = 0; x < width; x++) { readb(src); writeb(0, dst); dst++; src++; } src += line_ofs; dst += line_ofs; } } else { dst = info->screen_base + (dx/8) + width + (dy + height - 1) * info->fix.line_length; src = info->screen_base + (sx/8) + width + (sy + height - 1) * info->fix.line_length; while (height--) { for (x = 0; x < width; x++) { dst--; src--; readb(src); writeb(0, dst); } src -= line_ofs; dst -= line_ofs; } } } else vga_8planes_copyarea(info, area); break; case FB_TYPE_PACKED_PIXELS: default: cfb_copyarea(info, area); break; } } #define TRANS_MASK_LOW {0x0,0x8,0x4,0xC,0x2,0xA,0x6,0xE,0x1,0x9,0x5,0xD,0x3,0xB,0x7,0xF} #define TRANS_MASK_HIGH {0x000, 0x800, 0x400, 0xC00, 0x200, 0xA00, 0x600, 0xE00, \ 0x100, 0x900, 0x500, 0xD00, 0x300, 0xB00, 0x700, 0xF00} #if defined(__LITTLE_ENDIAN) static const u16 transl_l[] = TRANS_MASK_LOW; static const u16 transl_h[] = TRANS_MASK_HIGH; #elif defined(__BIG_ENDIAN) static const u16 transl_l[] = TRANS_MASK_HIGH; static const u16 transl_h[] = TRANS_MASK_LOW; #else #error "Only __BIG_ENDIAN and __LITTLE_ENDIAN are supported in vga-planes" #endif static void vga_8planes_imageblit(struct fb_info *info, const struct fb_image *image) { char oldindex = getindex(); char oldmode = setmode(0x40); char oldop = setop(0); char oldsr = setsr(0); char oldmask = selectmask(); const unsigned char *cdat = image->data; u32 dx = image->dx; char __iomem *where; int y; dx /= 4; where = info->screen_base + dx + image->dy * info->fix.line_length; setmask(0xff); writeb(image->bg_color, where); readb(where); selectmask(); setmask(image->fg_color ^ image->bg_color); setmode(0x42); setop(0x18); for (y = 0; y < image->height; y++, where += info->fix.line_length) writew(transl_h[cdat[y]&0xF] | transl_l[cdat[y] >> 4], where); setmask(oldmask); setsr(oldsr); setop(oldop); setmode(oldmode); setindex(oldindex); } static void vga_imageblit_expand(struct fb_info *info, const struct fb_image *image) { char __iomem *where = info->screen_base + (image->dx/8) + image->dy * info->fix.line_length; struct vga16fb_par *par = info->par; char *cdat = (char *) image->data; char __iomem *dst; int x, y; switch (info->fix.type) { case FB_TYPE_VGA_PLANES: if (info->fix.type_aux == FB_AUX_VGA_PLANES_VGA4) { if (par->isVGA) { setmode(2); setop(0); setsr(0xf); setcolor(image->fg_color); selectmask(); setmask(0xff); writeb(image->bg_color, where); rmb(); readb(where); /* fill latches */ setmode(3); wmb(); for (y = 0; y < image->height; y++) { dst = where; for (x = image->width/8; x--;) writeb(*cdat++, dst++); where += info->fix.line_length; } wmb(); } else { setmode(0); setop(0); setsr(0xf); setcolor(image->bg_color); selectmask(); setmask(0xff); for (y = 0; y < image->height; y++) { dst = where; for (x=image->width/8; x--;){ rmw(dst); setcolor(image->fg_color); selectmask(); if (*cdat) { setmask(*cdat++); rmw(dst++); } } where += info->fix.line_length; } } } else vga_8planes_imageblit(info, image); break; case FB_TYPE_PACKED_PIXELS: default: cfb_imageblit(info, image); break; } } static void vga_imageblit_color(struct fb_info *info, const struct fb_image *image) { /* * Draw logo */ struct vga16fb_par *par = info->par; char __iomem *where = info->screen_base + image->dy * info->fix.line_length + image->dx/8; const char *cdat = image->data; char __iomem *dst; int x, y; switch (info->fix.type) { case FB_TYPE_VGA_PLANES: if (info->fix.type_aux == FB_AUX_VGA_PLANES_VGA4 && par->isVGA) { setsr(0xf); setop(0); setmode(0); for (y = 0; y < image->height; y++) { for (x = 0; x < image->width; x++) { dst = where + x/8; setcolor(*cdat); selectmask(); setmask(1 << (7 - (x % 8))); fb_readb(dst); fb_writeb(0, dst); cdat++; } where += info->fix.line_length; } } break; case FB_TYPE_PACKED_PIXELS: cfb_imageblit(info, image); break; default: break; } } static void vga16fb_imageblit(struct fb_info *info, const struct fb_image *image) { if (image->depth == 1) vga_imageblit_expand(info, image); else vga_imageblit_color(info, image); } static void vga16fb_destroy(struct fb_info *info) { iounmap(info->screen_base); fb_dealloc_cmap(&info->cmap); /* XXX unshare VGA regions */ framebuffer_release(info); } static struct fb_ops vga16fb_ops = { .owner = THIS_MODULE, .fb_open = vga16fb_open, .fb_release = vga16fb_release, .fb_destroy = vga16fb_destroy, .fb_check_var = vga16fb_check_var, .fb_set_par = vga16fb_set_par, .fb_setcolreg = vga16fb_setcolreg, .fb_pan_display = vga16fb_pan_display, .fb_blank = vga16fb_blank, .fb_fillrect = vga16fb_fillrect, .fb_copyarea = vga16fb_copyarea, .fb_imageblit = vga16fb_imageblit, }; #ifndef MODULE static int __init vga16fb_setup(char *options) { char *this_opt; if (!options || !*options) return 0; while ((this_opt = strsep(&options, ",")) != NULL) { if (!*this_opt) continue; } return 0; } #endif static int vga16fb_probe(struct platform_device *dev) { struct fb_info *info; struct vga16fb_par *par; int i; int ret = 0; printk(KERN_DEBUG "vga16fb: initializing\n"); info = framebuffer_alloc(sizeof(struct vga16fb_par), &dev->dev); if (!info) { ret = -ENOMEM; goto err_fb_alloc; } info->apertures = alloc_apertures(1); if (!info->apertures) { ret = -ENOMEM; goto err_ioremap; } /* XXX share VGA_FB_PHYS and I/O region with vgacon and others */ info->screen_base = (void __iomem *)VGA_MAP_MEM(VGA_FB_PHYS, 0); if (!info->screen_base) { printk(KERN_ERR "vga16fb: unable to map device\n"); ret = -ENOMEM; goto err_ioremap; } printk(KERN_INFO "vga16fb: mapped to 0x%p\n", info->screen_base); par = info->par; par->isVGA = screen_info.orig_video_isVGA; par->palette_blanked = 0; par->vesa_blanked = 0; i = par->isVGA? 6 : 2; vga16fb_defined.red.length = i; vga16fb_defined.green.length = i; vga16fb_defined.blue.length = i; /* name should not depend on EGA/VGA */ info->fbops = &vga16fb_ops; info->var = vga16fb_defined; info->fix = vga16fb_fix; /* supports rectangles with widths of multiples of 8 */ info->pixmap.blit_x = 1 << 7 | 1 << 15 | 1 << 23 | 1 << 31; info->flags = FBINFO_FLAG_DEFAULT | FBINFO_MISC_FIRMWARE | FBINFO_HWACCEL_YPAN; i = (info->var.bits_per_pixel == 8) ? 256 : 16; ret = fb_alloc_cmap(&info->cmap, i, 0); if (ret) { printk(KERN_ERR "vga16fb: unable to allocate colormap\n"); ret = -ENOMEM; goto err_alloc_cmap; } if (vga16fb_check_var(&info->var, info)) { printk(KERN_ERR "vga16fb: unable to validate variable\n"); ret = -EINVAL; goto err_check_var; } vga16fb_update_fix(info); info->apertures->ranges[0].base = VGA_FB_PHYS; info->apertures->ranges[0].size = VGA_FB_PHYS_LEN; if (register_framebuffer(info) < 0) { printk(KERN_ERR "vga16fb: unable to register framebuffer\n"); ret = -EINVAL; goto err_check_var; } fb_info(info, "%s frame buffer device\n", info->fix.id); platform_set_drvdata(dev, info); return 0; err_check_var: fb_dealloc_cmap(&info->cmap); err_alloc_cmap: iounmap(info->screen_base); err_ioremap: framebuffer_release(info); err_fb_alloc: return ret; } static int vga16fb_remove(struct platform_device *dev) { struct fb_info *info = platform_get_drvdata(dev); if (info) unregister_framebuffer(info); return 0; } static struct platform_driver vga16fb_driver = { .probe = vga16fb_probe, .remove = vga16fb_remove, .driver = { .name = "vga16fb", }, }; static struct platform_device *vga16fb_device; static int __init vga16fb_init(void) { int ret; #ifndef MODULE char *option = NULL; if (fb_get_options("vga16fb", &option)) return -ENODEV; vga16fb_setup(option); #endif ret = platform_driver_register(&vga16fb_driver); if (!ret) { vga16fb_device = platform_device_alloc("vga16fb", 0); if (vga16fb_device) ret = platform_device_add(vga16fb_device); else ret = -ENOMEM; if (ret) { platform_device_put(vga16fb_device); platform_driver_unregister(&vga16fb_driver); } } return ret; } static void __exit vga16fb_exit(void) { platform_device_unregister(vga16fb_device); platform_driver_unregister(&vga16fb_driver); } MODULE_DESCRIPTION("Legacy VGA framebuffer device driver"); MODULE_LICENSE("GPL"); module_init(vga16fb_init); module_exit(vga16fb_exit); /* * Overrides for Emacs so that we follow Linus's tabbing style. * --------------------------------------------------------------------------- * Local variables: * c-basic-offset: 8 * End: */
49 49 49 52 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 /* * Copyright (c) International Business Machines Corp., 2006 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Author: Artem Bityutskiy (Битюцкий Артём) */ /* Here we keep miscellaneous functions which are used all over the UBI code */ #include "ubi.h" /** * calc_data_len - calculate how much real data is stored in a buffer. * @ubi: UBI device description object * @buf: a buffer with the contents of the physical eraseblock * @length: the buffer length * * This function calculates how much "real data" is stored in @buf and returnes * the length. Continuous 0xFF bytes at the end of the buffer are not * considered as "real data". */ int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf, int length) { int i; ubi_assert(!(length & (ubi->min_io_size - 1))); for (i = length - 1; i >= 0; i--) if (((const uint8_t *)buf)[i] != 0xFF) break; /* The resulting length must be aligned to the minimum flash I/O size */ length = ALIGN(i + 1, ubi->min_io_size); return length; } /** * ubi_check_volume - check the contents of a static volume. * @ubi: UBI device description object * @vol_id: ID of the volume to check * * This function checks if static volume @vol_id is corrupted by fully reading * it and checking data CRC. This function returns %0 if the volume is not * corrupted, %1 if it is corrupted and a negative error code in case of * failure. Dynamic volumes are not checked and zero is returned immediately. */ int ubi_check_volume(struct ubi_device *ubi, int vol_id) { void *buf; int err = 0, i; struct ubi_volume *vol = ubi->volumes[vol_id]; if (vol->vol_type != UBI_STATIC_VOLUME) return 0; buf = vmalloc(vol->usable_leb_size); if (!buf) return -ENOMEM; for (i = 0; i < vol->used_ebs; i++) { int size; cond_resched(); if (i == vol->used_ebs - 1) size = vol->last_eb_bytes; else size = vol->usable_leb_size; err = ubi_eba_read_leb(ubi, vol, i, buf, 0, size, 1); if (err) { if (mtd_is_eccerr(err)) err = 1; break; } } vfree(buf); return err; } /** * ubi_update_reserved - update bad eraseblock handling accounting data. * @ubi: UBI device description object * * This function calculates the gap between current number of PEBs reserved for * bad eraseblock handling and the required level of PEBs that must be * reserved, and if necessary, reserves more PEBs to fill that gap, according * to availability. Should be called with ubi->volumes_lock held. */ void ubi_update_reserved(struct ubi_device *ubi) { int need = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs; if (need <= 0 || ubi->avail_pebs == 0) return; need = min_t(int, need, ubi->avail_pebs); ubi->avail_pebs -= need; ubi->rsvd_pebs += need; ubi->beb_rsvd_pebs += need; ubi_msg(ubi, "reserved more %d PEBs for bad PEB handling", need); } /** * ubi_calculate_reserved - calculate how many PEBs must be reserved for bad * eraseblock handling. * @ubi: UBI device description object */ void ubi_calculate_reserved(struct ubi_device *ubi) { /* * Calculate the actual number of PEBs currently needed to be reserved * for future bad eraseblock handling. */ ubi->beb_rsvd_level = ubi->bad_peb_limit - ubi->bad_peb_count; if (ubi->beb_rsvd_level < 0) { ubi->beb_rsvd_level = 0; ubi_warn(ubi, "number of bad PEBs (%d) is above the expected limit (%d), not reserving any PEBs for bad PEB handling, will use available PEBs (if any)", ubi->bad_peb_count, ubi->bad_peb_limit); } } /** * ubi_check_pattern - check if buffer contains only a certain byte pattern. * @buf: buffer to check * @patt: the pattern to check * @size: buffer size in bytes * * This function returns %1 in there are only @patt bytes in @buf, and %0 if * something else was also found. */ int ubi_check_pattern(const void *buf, uint8_t patt, int size) { int i; for (i = 0; i < size; i++) if (((const uint8_t *)buf)[i] != patt) return 0; return 1; } /* Normal UBI messages */ void ubi_msg(const struct ubi_device *ubi, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_notice(UBI_NAME_STR "%d: %pV\n", ubi->ubi_num, &vaf); va_end(args); } /* UBI warning messages */ void ubi_warn(const struct ubi_device *ubi, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_warn(UBI_NAME_STR "%d warning: %ps: %pV\n", ubi->ubi_num, __builtin_return_address(0), &vaf); va_end(args); } /* UBI error messages */ void ubi_err(const struct ubi_device *ubi, const char *fmt, ...) { struct va_format vaf; va_list args; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; pr_err(UBI_NAME_STR "%d error: %ps: %pV\n", ubi->ubi_num, __builtin_return_address(0), &vaf); va_end(args); }
21 1 1 1 34 34 34 34 34 34 33 33 33 33 33 33 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 /* * net/sched/sch_mq.c Classful multiqueue dummy scheduler * * Copyright (c) 2009 Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * version 2 as published by the Free Software Foundation. */ #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/string.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <net/netlink.h> #include <net/pkt_sched.h> struct mq_sched { struct Qdisc **qdiscs; }; static void mq_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); unsigned int ntx; if (!priv->qdiscs) return; for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) qdisc_destroy(priv->qdiscs[ntx]); kfree(priv->qdiscs); } static int mq_init(struct Qdisc *sch, struct nlattr *opt) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); struct netdev_queue *dev_queue; struct Qdisc *qdisc; unsigned int ntx; if (sch->parent != TC_H_ROOT) return -EOPNOTSUPP; if (!netif_is_multiqueue(dev)) return -EOPNOTSUPP; /* pre-allocate qdiscs, attachment can't fail */ priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]), GFP_KERNEL); if (!priv->qdiscs) return -ENOMEM; for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1))); if (!qdisc) return -ENOMEM; priv->qdiscs[ntx] = qdisc; qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; } sch->flags |= TCQ_F_MQROOT; return 0; } static void mq_attach(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); struct Qdisc *qdisc, *old; unsigned int ntx; for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = priv->qdiscs[ntx]; old = dev_graft_qdisc(qdisc->dev_queue, qdisc); if (old) qdisc_destroy(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); #endif } kfree(priv->qdiscs); priv->qdiscs = NULL; } static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; unsigned int ntx; sch->q.qlen = 0; memset(&sch->bstats, 0, sizeof(sch->bstats)); memset(&sch->qstats, 0, sizeof(sch->qstats)); for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; spin_lock_bh(qdisc_lock(qdisc)); sch->q.qlen += qdisc->q.qlen; sch->bstats.bytes += qdisc->bstats.bytes; sch->bstats.packets += qdisc->bstats.packets; sch->qstats.backlog += qdisc->qstats.backlog; sch->qstats.drops += qdisc->qstats.drops; sch->qstats.requeues += qdisc->qstats.requeues; sch->qstats.overlimits += qdisc->qstats.overlimits; spin_unlock_bh(qdisc_lock(qdisc)); } return 0; } static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static struct netdev_queue *mq_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { unsigned int ntx = TC_H_MIN(tcm->tcm_parent); struct netdev_queue *dev_queue = mq_queue_get(sch, ntx); if (!dev_queue) { struct net_device *dev = qdisc_dev(sch); return netdev_get_tx_queue(dev, 0); } return dev_queue; } static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); struct net_device *dev = qdisc_dev(sch); if (dev->flags & IFF_UP) dev_deactivate(dev); *old = dev_graft_qdisc(dev_queue, new); if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); return dev_queue->qdisc_sleeping; } static unsigned long mq_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!mq_queue_get(sch, ntx)) return 0; return ntx; } static int mq_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = dev_queue->qdisc_sleeping->handle; return 0; } static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); sch = dev_queue->qdisc_sleeping; if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, &sch->bstats) < 0 || gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) return -1; return 0; } static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned int ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (arg->fn(sch, ntx + 1, arg) < 0) { arg->stop = 1; break; } arg->count++; } } static const struct Qdisc_class_ops mq_class_ops = { .select_queue = mq_select_queue, .graft = mq_graft, .leaf = mq_leaf, .find = mq_find, .walk = mq_walk, .dump = mq_dump_class, .dump_stats = mq_dump_class_stats, }; struct Qdisc_ops mq_qdisc_ops __read_mostly = { .cl_ops = &mq_class_ops, .id = "mq", .priv_size = sizeof(struct mq_sched), .init = mq_init, .destroy = mq_destroy, .attach = mq_attach, .dump = mq_dump, .owner = THIS_MODULE, };
628 463 461 329 328 329 627 1 1 1 223 223 223 223 223 223 310 306 27 292 290 310 306 310 4 310 306 4 241 241 241 241 82 82 82 79 67 67 6 1 5 2 1 2 78 309 305 240 144 6 17 402 115 352 242 42 208 57 57 57 262 408 221 3 81 1 408 287 76 403 276 267 64 10 56 55 53 347 70 349 491 492 9 9 8 9 9 9 32 19 13 31 13 13 223 5 11 1 1 1 424 153 21 21 21 21 20 18 11 8 276 16 270 5 5 4 3 3 267 276 401 139 274 24 22 273 401 385 385 383 383 8 8 3 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 /* SCTP kernel implementation * (C) Copyright IBM Corp. 2002, 2004 * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * Copyright (c) 2002-2003 Intel Corp. * * This file is part of the SCTP kernel implementation * * SCTP over IPv6. * * This SCTP implementation is free software; * you can redistribute it and/or modify it under the terms of * the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your option) * any later version. * * This SCTP implementation is distributed in the hope that it * will be useful, but WITHOUT ANY WARRANTY; without even the implied * ************************ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GNU CC; see the file COPYING. If not, see * <http://www.gnu.org/licenses/>. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Le Yanqun <yanqun.le@nokia.com> * Hui Huang <hui.huang@nokia.com> * La Monte H.P. Yarroll <piggy@acm.org> * Sridhar Samudrala <sri@us.ibm.com> * Jon Grimm <jgrimm@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * * Based on: * linux/net/ipv6/tcp_ipv6.c */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/ipsec.h> #include <linux/slab.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/seq_file.h> #include <net/protocol.h> #include <net/ndisc.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/inet_common.h> #include <net/inet_ecn.h> #include <net/sctp/sctp.h> #include <linux/uaccess.h> static inline int sctp_v6_addr_match_len(union sctp_addr *s1, union sctp_addr *s2); static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, __be16 port); static int sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2); /* Event handler for inet6 address addition/deletion events. * The sctp_local_addr_list needs to be protocted by a spin lock since * multiple notifiers (say IPv4 and IPv6) may be running at the same * time and thus corrupt the list. * The reader side is protected with RCU. */ static int sctp_inet6addr_event(struct notifier_block *this, unsigned long ev, void *ptr) { struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sctp_sockaddr_entry *addr = NULL; struct sctp_sockaddr_entry *temp; struct net *net = dev_net(ifa->idev->dev); int found = 0; switch (ev) { case NETDEV_UP: addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifa->addr; addr->a.v6.sin6_scope_id = ifa->idev->dev->ifindex; addr->valid = 1; spin_lock_bh(&net->sctp.local_addr_lock); list_add_tail_rcu(&addr->list, &net->sctp.local_addr_list); sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_NEW); spin_unlock_bh(&net->sctp.local_addr_lock); } break; case NETDEV_DOWN: spin_lock_bh(&net->sctp.local_addr_lock); list_for_each_entry_safe(addr, temp, &net->sctp.local_addr_list, list) { if (addr->a.sa.sa_family == AF_INET6 && ipv6_addr_equal(&addr->a.v6.sin6_addr, &ifa->addr)) { sctp_addr_wq_mgmt(net, addr, SCTP_ADDR_DEL); found = 1; addr->valid = 0; list_del_rcu(&addr->list); break; } } spin_unlock_bh(&net->sctp.local_addr_lock); if (found) kfree_rcu(addr, rcu); break; } return NOTIFY_DONE; } static struct notifier_block sctp_inet6addr_notifier = { .notifier_call = sctp_inet6addr_event, }; /* ICMP error handler. */ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { struct inet6_dev *idev; struct sock *sk; struct sctp_association *asoc; struct sctp_transport *transport; struct ipv6_pinfo *np; __u16 saveip, savesctp; int err; struct net *net = dev_net(skb->dev); idev = in6_dev_get(skb->dev); /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); skb_set_transport_header(skb, offset); sk = sctp_err_lookup(net, AF_INET6, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original pointers. */ skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_INERRORS); goto out; } /* Warning: The sock lock is held. Remember to call * sctp_err_finish! */ switch (type) { case ICMPV6_PKT_TOOBIG: if (ip6_sk_accept_pmtu(sk)) sctp_icmp_frag_needed(sk, asoc, transport, ntohl(info)); goto out_unlock; case ICMPV6_PARAMPROB: if (ICMPV6_UNK_NEXTHDR == code) { sctp_icmp_proto_unreachable(sk, asoc, transport); goto out_unlock; } break; case NDISC_REDIRECT: sctp_icmp_redirect(sk, transport, skb); goto out_unlock; default: break; } np = inet6_sk(sk); icmpv6_err_convert(type, code, &err); if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; sk->sk_error_report(sk); } else { /* Only an error on timeout */ sk->sk_err_soft = err; } out_unlock: sctp_err_finish(sk, transport); out: if (likely(idev != NULL)) in6_dev_put(idev); } static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport) { struct sock *sk = skb->sk; struct ipv6_pinfo *np = inet6_sk(sk); struct flowi6 *fl6 = &transport->fl.u.ip6; int res; pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb, skb->len, &fl6->saddr, &fl6->daddr); IP6_ECN_flow_xmit(sk, fl6->flowlabel); if (!(transport->param_flags & SPP_PMTUD_ENABLE)) skb->ignore_df = 1; SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS); rcu_read_lock(); res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), np->tclass); rcu_read_unlock(); return res; } /* Returns the dst cache entry for the given source and destination ip * addresses. */ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr, struct flowi *fl, struct sock *sk) { struct sctp_association *asoc = t->asoc; struct dst_entry *dst = NULL; struct flowi _fl; struct flowi6 *fl6 = &_fl.u.ip6; struct sctp_bind_addr *bp; struct ipv6_pinfo *np = inet6_sk(sk); struct sctp_sockaddr_entry *laddr; union sctp_addr *daddr = &t->ipaddr; union sctp_addr dst_saddr; struct in6_addr *final_p, final; enum sctp_scope scope; __u8 matchlen = 0; memset(&_fl, 0, sizeof(_fl)); fl6->daddr = daddr->v6.sin6_addr; fl6->fl6_dport = daddr->v6.sin6_port; fl6->flowi6_proto = IPPROTO_SCTP; if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) fl6->flowi6_oif = daddr->v6.sin6_scope_id; else if (asoc) fl6->flowi6_oif = asoc->base.sk->sk_bound_dev_if; pr_debug("%s: dst=%pI6 ", __func__, &fl6->daddr); if (asoc) fl6->fl6_sport = htons(asoc->base.bind_addr.port); if (saddr) { fl6->saddr = saddr->v6.sin6_addr; fl6->fl6_sport = saddr->v6.sin6_port; pr_debug("src=%pI6 - ", &fl6->saddr); } rcu_read_lock(); final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!asoc || saddr) { t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); goto out; } bp = &asoc->base.bind_addr; scope = sctp_scope(daddr); /* ip6_dst_lookup has filled in the fl6->saddr for us. Check * to see if we can use it. */ if (!IS_ERR(dst)) { /* Walk through the bind address list and look for a bind * address that matches the source address of the returned dst. */ sctp_v6_to_addr(&dst_saddr, &fl6->saddr, htons(bp->port)); rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { if (!laddr->valid || laddr->state == SCTP_ADDR_DEL || (laddr->state != SCTP_ADDR_SRC && !asoc->src_out_of_asoc_ok)) continue; /* Do not compare against v4 addrs */ if ((laddr->a.sa.sa_family == AF_INET6) && (sctp_v6_cmp_addr(&dst_saddr, &laddr->a))) { rcu_read_unlock(); t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); goto out; } } rcu_read_unlock(); /* None of the bound addresses match the source address of the * dst. So release it. */ dst_release(dst); dst = NULL; } /* Walk through the bind address list and try to get the * best source address for a given destination. */ rcu_read_lock(); list_for_each_entry_rcu(laddr, &bp->address_list, list) { struct dst_entry *bdst; __u8 bmatchlen; if (!laddr->valid || laddr->state != SCTP_ADDR_SRC || laddr->a.sa.sa_family != AF_INET6 || scope > sctp_scope(&laddr->a)) continue; fl6->saddr = laddr->a.v6.sin6_addr; fl6->fl6_sport = laddr->a.v6.sin6_port; final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); bdst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(bdst)) continue; if (ipv6_chk_addr(dev_net(bdst->dev), &laddr->a.v6.sin6_addr, bdst->dev, 1)) { if (!IS_ERR_OR_NULL(dst)) dst_release(dst); dst = bdst; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); break; } bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); if (matchlen > bmatchlen) { dst_release(bdst); continue; } if (!IS_ERR_OR_NULL(dst)) dst_release(dst); dst = bdst; matchlen = bmatchlen; t->dst = dst; memcpy(fl, &_fl, sizeof(_fl)); } rcu_read_unlock(); out: if (!IS_ERR_OR_NULL(dst)) { struct rt6_info *rt; rt = (struct rt6_info *)dst; t->dst_cookie = rt6_get_cookie(rt); pr_debug("rt6_dst:%pI6/%d rt6_src:%pI6\n", &rt->rt6i_dst.addr, rt->rt6i_dst.plen, &fl->u.ip6.saddr); } else { t->dst = NULL; pr_debug("no route\n"); } } /* Returns the number of consecutive initial bits that match in the 2 ipv6 * addresses. */ static inline int sctp_v6_addr_match_len(union sctp_addr *s1, union sctp_addr *s2) { return ipv6_addr_diff(&s1->v6.sin6_addr, &s2->v6.sin6_addr); } /* Fills in the source address(saddr) based on the destination address(daddr) * and asoc's bind address list. */ static void sctp_v6_get_saddr(struct sctp_sock *sk, struct sctp_transport *t, struct flowi *fl) { struct flowi6 *fl6 = &fl->u.ip6; union sctp_addr *saddr = &t->saddr; pr_debug("%s: asoc:%p dst:%p\n", __func__, t->asoc, t->dst); if (t->dst) { saddr->v6.sin6_family = AF_INET6; saddr->v6.sin6_addr = fl6->saddr; } } /* Make a copy of all potential local addresses. */ static void sctp_v6_copy_addrlist(struct list_head *addrlist, struct net_device *dev) { struct inet6_dev *in6_dev; struct inet6_ifaddr *ifp; struct sctp_sockaddr_entry *addr; rcu_read_lock(); if ((in6_dev = __in6_dev_get(dev)) == NULL) { rcu_read_unlock(); return; } read_lock_bh(&in6_dev->lock); list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { /* Add the address to the local list. */ addr = kzalloc(sizeof(*addr), GFP_ATOMIC); if (addr) { addr->a.v6.sin6_family = AF_INET6; addr->a.v6.sin6_addr = ifp->addr; addr->a.v6.sin6_scope_id = dev->ifindex; addr->valid = 1; INIT_LIST_HEAD(&addr->list); list_add_tail(&addr->list, addrlist); } } read_unlock_bh(&in6_dev->lock); rcu_read_unlock(); } /* Initialize a sockaddr_storage from in incoming skb. */ static void sctp_v6_from_skb(union sctp_addr *addr, struct sk_buff *skb, int is_saddr) { /* Always called on head skb, so this is safe */ struct sctphdr *sh = sctp_hdr(skb); struct sockaddr_in6 *sa = &addr->v6; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; /* FIXME */ addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; if (is_saddr) { sa->sin6_port = sh->source; sa->sin6_addr = ipv6_hdr(skb)->saddr; } else { sa->sin6_port = sh->dest; sa->sin6_addr = ipv6_hdr(skb)->daddr; } } /* Initialize an sctp_addr from a socket. */ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = 0; addr->v6.sin6_addr = sk->sk_v6_rcv_saddr; } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk) { if (addr->sa.sa_family == AF_INET) { sk->sk_v6_rcv_saddr.s6_addr32[0] = 0; sk->sk_v6_rcv_saddr.s6_addr32[1] = 0; sk->sk_v6_rcv_saddr.s6_addr32[2] = htonl(0x0000ffff); sk->sk_v6_rcv_saddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { sk->sk_v6_rcv_saddr = addr->v6.sin6_addr; } } /* Initialize sk->sk_daddr from sctp_addr. */ static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) { if (addr->sa.sa_family == AF_INET) { sk->sk_v6_daddr.s6_addr32[0] = 0; sk->sk_v6_daddr.s6_addr32[1] = 0; sk->sk_v6_daddr.s6_addr32[2] = htonl(0x0000ffff); sk->sk_v6_daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; } else { sk->sk_v6_daddr = addr->v6.sin6_addr; } } /* Initialize a sctp_addr from an address parameter. */ static bool sctp_v6_from_addr_param(union sctp_addr *addr, union sctp_addr_param *param, __be16 port, int iif) { if (ntohs(param->v6.param_hdr.length) < sizeof(struct sctp_ipv6addr_param)) return false; addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; /* BUG */ addr->v6.sin6_addr = param->v6.addr; addr->v6.sin6_scope_id = iif; return true; } /* Initialize an address parameter from a sctp_addr and return the length * of the address parameter. */ static int sctp_v6_to_addr_param(const union sctp_addr *addr, union sctp_addr_param *param) { int length = sizeof(struct sctp_ipv6addr_param); param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS; param->v6.param_hdr.length = htons(length); param->v6.addr = addr->v6.sin6_addr; return length; } /* Initialize a sctp_addr from struct in6_addr. */ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, __be16 port) { addr->sa.sa_family = AF_INET6; addr->v6.sin6_port = port; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = *saddr; addr->v6.sin6_scope_id = 0; } static int __sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { if (addr1->sa.sa_family != addr2->sa.sa_family) { if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr2->v6.sin6_addr) && addr2->v6.sin6_addr.s6_addr32[3] == addr1->v4.sin_addr.s_addr) return 1; if (addr2->sa.sa_family == AF_INET && addr1->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr1->v6.sin6_addr) && addr1->v6.sin6_addr.s6_addr32[3] == addr2->v4.sin_addr.s_addr) return 1; return 0; } if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) return 0; /* If this is a linklocal address, compare the scope_id. */ if ((ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) && addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id && addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id) return 0; return 1; } /* Compare addresses exactly. * v4-mapped-v6 is also in consideration. */ static int sctp_v6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2) { return __sctp_v6_cmp_addr(addr1, addr2) && addr1->v6.sin6_port == addr2->v6.sin6_port; } /* Initialize addr struct to INADDR_ANY. */ static void sctp_v6_inaddr_any(union sctp_addr *addr, __be16 port) { memset(addr, 0x00, sizeof(union sctp_addr)); addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = port; } /* Is this a wildcard address? */ static int sctp_v6_is_any(const union sctp_addr *addr) { return ipv6_addr_any(&addr->v6.sin6_addr); } /* Should this be available for binding? */ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) { int type; struct net *net = sock_net(&sp->inet.sk); const struct in6_addr *in6 = (const struct in6_addr *)&addr->v6.sin6_addr; type = ipv6_addr_type(in6); if (IPV6_ADDR_ANY == type) return 1; if (type == IPV6_ADDR_MAPPED) { if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); return sctp_get_af_specific(AF_INET)->available(addr, sp); } if (!(type & IPV6_ADDR_UNICAST)) return 0; return sp->inet.freebind || net->ipv6.sysctl.ip_nonlocal_bind || ipv6_chk_addr(net, in6, NULL, 0); } /* This function checks if the address is a valid address to be used for * SCTP. * * Output: * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp, const struct sk_buff *skb) { int ret = ipv6_addr_type(&addr->v6.sin6_addr); /* Support v4-mapped-v6 address. */ if (ret == IPV6_ADDR_MAPPED) { /* Note: This routine is used in input, so v4-mapped-v6 * are disallowed here when there is no sctp_sock. */ if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb); } /* Is this a non-unicast address */ if (!(ret & IPV6_ADDR_UNICAST)) return 0; return 1; } /* What is the scope of 'addr'? */ static enum sctp_scope sctp_v6_scope(union sctp_addr *addr) { enum sctp_scope retval; int v6scope; /* The IPv6 scope is really a set of bit fields. * See IFA_* in <net/if_inet6.h>. Map to a generic SCTP scope. */ v6scope = ipv6_addr_scope(&addr->v6.sin6_addr); switch (v6scope) { case IFA_HOST: retval = SCTP_SCOPE_LOOPBACK; break; case IFA_LINK: retval = SCTP_SCOPE_LINK; break; case IFA_SITE: retval = SCTP_SCOPE_PRIVATE; break; default: retval = SCTP_SCOPE_GLOBAL; break; } return retval; } /* Create and initialize a new sk for the socket to be returned by accept(). */ static struct sock *sctp_v6_create_accept_sk(struct sock *sk, struct sctp_association *asoc, bool kern) { struct sock *newsk; struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct sctp6_sock *newsctp6sk; struct ipv6_txoptions *opt; newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern); if (!newsk) goto out; sock_init_data(NULL, newsk); sctp_copy_sock(newsk, sk, asoc); sock_reset_flag(sk, SOCK_ZAPPED); newsctp6sk = (struct sctp6_sock *)newsk; inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped; newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); rcu_read_unlock(); /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() * and getpeername(). */ sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; sk_refcnt_debug_inc(newsk); if (newsk->sk_prot->init(newsk)) { sk_common_release(newsk); newsk = NULL; } out: return newsk; } /* Format a sockaddr for return to user space. This makes sure the return is * AF_INET or AF_INET6 depending on the SCTP_I_WANT_MAPPED_V4_ADDR option. */ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { if (sp->v4mapped) { if (addr->sa.sa_family == AF_INET) sctp_v4_map_v6(addr); } else { if (addr->sa.sa_family == AF_INET6 && ipv6_addr_v4mapped(&addr->v6.sin6_addr)) sctp_v6_map_v4(addr); } if (addr->sa.sa_family == AF_INET) { memset(addr->v4.sin_zero, 0, sizeof(addr->v4.sin_zero)); return sizeof(struct sockaddr_in); } return sizeof(struct sockaddr_in6); } /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { return IP6CB(skb)->iif; } /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v6_is_ce(const struct sk_buff *skb) { return *((__u32 *)(ipv6_hdr(skb))) & (__force __u32)htonl(1 << 20); } /* Dump the v6 addr to the seq file. */ static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) { seq_printf(seq, "%pI6 ", &addr->v6.sin6_addr); } static void sctp_v6_ecn_capable(struct sock *sk) { inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } /* Initialize a PF_INET msgname from a ulpevent. */ static void sctp_inet6_event_msgname(struct sctp_ulpevent *event, char *msgname, int *addrlen) { union sctp_addr *addr; struct sctp_association *asoc; union sctp_addr *paddr; if (!msgname) return; addr = (union sctp_addr *)msgname; asoc = event->asoc; paddr = &asoc->peer.primary_addr; if (paddr->sa.sa_family == AF_INET) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = htons(asoc->peer.port); addr->v4.sin_addr = paddr->v4.sin_addr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; if (ipv6_addr_type(&paddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = paddr->v6.sin6_scope_id; else addr->v6.sin6_scope_id = 0; addr->v6.sin6_port = htons(asoc->peer.port); addr->v6.sin6_addr = paddr->v6.sin6_addr; } *addrlen = sctp_v6_addr_to_user(sctp_sk(asoc->base.sk), addr); } /* Initialize a msg_name from an inbound skb. */ static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, int *addr_len) { union sctp_addr *addr; struct sctphdr *sh; if (!msgname) return; addr = (union sctp_addr *)msgname; sh = sctp_hdr(skb); if (ip_hdr(skb)->version == 4) { addr->v4.sin_family = AF_INET; addr->v4.sin_port = sh->source; addr->v4.sin_addr.s_addr = ip_hdr(skb)->saddr; } else { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_flowinfo = 0; addr->v6.sin6_port = sh->source; addr->v6.sin6_addr = ipv6_hdr(skb)->saddr; if (ipv6_addr_type(&addr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) addr->v6.sin6_scope_id = sctp_v6_skb_iif(skb); else addr->v6.sin6_scope_id = 0; } *addr_len = sctp_v6_addr_to_user(sctp_sk(skb->sk), addr); } /* Do we support this AF? */ static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp) { switch (family) { case AF_INET6: return 1; /* v4-mapped-v6 addresses */ case AF_INET: if (!__ipv6_only_sock(sctp_opt2sk(sp))) return 1; default: return 0; } } /* Address matching with wildcards allowed. This extra level * of indirection lets us choose whether a PF_INET6 should * disallow any v4 addresses if we so choose. */ static int sctp_inet6_cmp_addr(const union sctp_addr *addr1, const union sctp_addr *addr2, struct sctp_sock *opt) { struct sock *sk = sctp_opt2sk(opt); struct sctp_af *af1, *af2; af1 = sctp_get_af_specific(addr1->sa.sa_family); af2 = sctp_get_af_specific(addr2->sa.sa_family); if (!af1 || !af2) return 0; /* If the socket is IPv6 only, v4 addrs will not match */ if (__ipv6_only_sock(sk) && af1 != af2) return 0; /* Today, wildcard AF_INET/AF_INET6. */ if (sctp_is_any(sk, addr1) || sctp_is_any(sk, addr2)) return 1; if (addr1->sa.sa_family == AF_INET && addr2->sa.sa_family == AF_INET) return addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr; return __sctp_v6_cmp_addr(addr1, addr2); } /* Verify that the provided sockaddr looks bindable. Common verification, * has already been taken care of. */ static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) { struct sctp_af *af; /* ASSERT: address family has already been verified. */ if (addr->sa.sa_family != AF_INET6) af = sctp_get_af_specific(addr->sa.sa_family); else { int type = ipv6_addr_type(&addr->v6.sin6_addr); struct net_device *dev; if (type & IPV6_ADDR_LINKLOCAL) { struct net *net; if (!addr->v6.sin6_scope_id) return 0; net = sock_net(&opt->inet.sk); rcu_read_lock(); dev = dev_get_by_index_rcu(net, addr->v6.sin6_scope_id); if (!dev || !(opt->inet.freebind || net->ipv6.sysctl.ip_nonlocal_bind || ipv6_chk_addr(net, &addr->v6.sin6_addr, dev, 0))) { rcu_read_unlock(); return 0; } rcu_read_unlock(); } af = opt->pf->af; } return af->available(addr, opt); } /* Verify that the provided sockaddr looks sendable. Common verification, * has already been taken care of. */ static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr) { struct sctp_af *af = NULL; /* ASSERT: address family has already been verified. */ if (addr->sa.sa_family != AF_INET6) af = sctp_get_af_specific(addr->sa.sa_family); else { int type = ipv6_addr_type(&addr->v6.sin6_addr); struct net_device *dev; if (type & IPV6_ADDR_LINKLOCAL) { if (!addr->v6.sin6_scope_id) return 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(&opt->inet.sk), addr->v6.sin6_scope_id); rcu_read_unlock(); if (!dev) return 0; } af = opt->pf->af; } return af != NULL; } /* Fill in Supported Address Type information for INIT and INIT-ACK * chunks. Note: In the future, we may want to look at sock options * to determine whether a PF_INET6 socket really wants to have IPV4 * addresses. * Returns number of addresses supported. */ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, __be16 *types) { types[0] = SCTP_PARAM_IPV6_ADDRESS; if (!opt || !ipv6_only_sock(sctp_opt2sk(opt))) { types[1] = SCTP_PARAM_IPV4_ADDRESS; return 2; } return 1; } /* Handle SCTP_I_WANT_MAPPED_V4_ADDR for getpeername() and getsockname() */ static int sctp_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) { int rc; rc = inet6_getname(sock, uaddr, uaddr_len, peer); if (rc != 0) return rc; *uaddr_len = sctp_v6_addr_to_user(sctp_sk(sock->sk), (union sctp_addr *)uaddr); return rc; } static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = sctp_inet_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = sctp_getname, .poll = sctp_poll, .ioctl = inet6_ioctl, .listen = sctp_inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif }; static struct inet_protosw sctpv6_seqpacket_protosw = { .type = SOCK_SEQPACKET, .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG }; static struct inet_protosw sctpv6_stream_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_SCTP, .prot = &sctpv6_prot, .ops = &inet6_seqpacket_ops, .flags = SCTP_PROTOSW_FLAG, }; static int sctp6_rcv(struct sk_buff *skb) { return sctp_rcv(skb) ? -1 : 0; } static const struct inet6_protocol sctpv6_protocol = { .handler = sctp6_rcv, .err_handler = sctp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; static struct sctp_af sctp_af_inet6 = { .sa_family = AF_INET6, .sctp_xmit = sctp_v6_xmit, .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .get_dst = sctp_v6_get_dst, .get_saddr = sctp_v6_get_saddr, .copy_addrlist = sctp_v6_copy_addrlist, .from_skb = sctp_v6_from_skb, .from_sk = sctp_v6_from_sk, .from_addr_param = sctp_v6_from_addr_param, .to_addr_param = sctp_v6_to_addr_param, .cmp_addr = sctp_v6_cmp_addr, .scope = sctp_v6_scope, .addr_valid = sctp_v6_addr_valid, .inaddr_any = sctp_v6_inaddr_any, .is_any = sctp_v6_is_any, .available = sctp_v6_available, .skb_iif = sctp_v6_skb_iif, .is_ce = sctp_v6_is_ce, .seq_dump_addr = sctp_v6_seq_dump_addr, .ecn_capable = sctp_v6_ecn_capable, .net_header_len = sizeof(struct ipv6hdr), .sockaddr_len = sizeof(struct sockaddr_in6), #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, #endif }; static struct sctp_pf sctp_pf_inet6 = { .event_msgname = sctp_inet6_event_msgname, .skb_msgname = sctp_inet6_skb_msgname, .af_supported = sctp_inet6_af_supported, .cmp_addr = sctp_inet6_cmp_addr, .bind_verify = sctp_inet6_bind_verify, .send_verify = sctp_inet6_send_verify, .supported_addrs = sctp_inet6_supported_addrs, .create_accept_sk = sctp_v6_create_accept_sk, .addr_to_user = sctp_v6_addr_to_user, .to_sk_saddr = sctp_v6_to_sk_saddr, .to_sk_daddr = sctp_v6_to_sk_daddr, .af = &sctp_af_inet6, }; /* Initialize IPv6 support and register with socket layer. */ void sctp_v6_pf_init(void) { /* Register the SCTP specific PF_INET6 functions. */ sctp_register_pf(&sctp_pf_inet6, PF_INET6); /* Register the SCTP specific AF_INET6 functions. */ sctp_register_af(&sctp_af_inet6); } void sctp_v6_pf_exit(void) { list_del(&sctp_af_inet6.list); } /* Initialize IPv6 support and register with socket layer. */ int sctp_v6_protosw_init(void) { int rc; rc = proto_register(&sctpv6_prot, 1); if (rc) return rc; /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */ inet6_register_protosw(&sctpv6_seqpacket_protosw); inet6_register_protosw(&sctpv6_stream_protosw); return 0; } void sctp_v6_protosw_exit(void) { inet6_unregister_protosw(&sctpv6_seqpacket_protosw); inet6_unregister_protosw(&sctpv6_stream_protosw); proto_unregister(&sctpv6_prot); } /* Register with inet6 layer. */ int sctp_v6_add_protocol(void) { /* Register notifier for inet6 address additions/deletions. */ register_inet6addr_notifier(&sctp_inet6addr_notifier); if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) return -EAGAIN; return 0; } /* Unregister with inet6 layer. */ void sctp_v6_del_protocol(void) { inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP); unregister_inet6addr_notifier(&sctp_inet6addr_notifier); }
23 9 8 3 3 2 2 2 3 4 4 3 3 2 4 5 5 4 2 2 2 1 9 1 2 5 4 9 7 5 3 1 7 4 3 2 1 3 1 3 3 2 1 1 3 3 3 9 8 8 9 5 9 1 2 3 2 1 1 9 4 3 2 2 4 17 16 15 15 27 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 /* * * Author Karsten Keil <kkeil@novell.com> * * Copyright 2008 by Karsten Keil <kkeil@novell.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * */ #include <linux/mISDNif.h> #include <linux/slab.h> #include <linux/export.h> #include "core.h" static u_int *debug; static struct proto mISDN_proto = { .name = "misdn", .owner = THIS_MODULE, .obj_size = sizeof(struct mISDN_sock) }; #define _pms(sk) ((struct mISDN_sock *)sk) static struct mISDN_sock_list data_sockets = { .lock = __RW_LOCK_UNLOCKED(data_sockets.lock) }; static struct mISDN_sock_list base_sockets = { .lock = __RW_LOCK_UNLOCKED(base_sockets.lock) }; #define L2_HEADER_LEN 4 static inline struct sk_buff * _l2_alloc_skb(unsigned int len, gfp_t gfp_mask) { struct sk_buff *skb; skb = alloc_skb(len + L2_HEADER_LEN, gfp_mask); if (likely(skb)) skb_reserve(skb, L2_HEADER_LEN); return skb; } static void mISDN_sock_link(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_add_node(sk, &l->head); write_unlock_bh(&l->lock); } static void mISDN_sock_unlink(struct mISDN_sock_list *l, struct sock *sk) { write_lock_bh(&l->lock); sk_del_node_init(sk); write_unlock_bh(&l->lock); } static int mISDN_send(struct mISDNchannel *ch, struct sk_buff *skb) { struct mISDN_sock *msk; int err; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s len %d %p\n", __func__, skb->len, skb); if (msk->sk.sk_state == MISDN_CLOSED) return -EUNATCH; __net_timestamp(skb); err = sock_queue_rcv_skb(&msk->sk, skb); if (err) printk(KERN_WARNING "%s: error %d\n", __func__, err); return err; } static int mISDN_ctrl(struct mISDNchannel *ch, u_int cmd, void *arg) { struct mISDN_sock *msk; msk = container_of(ch, struct mISDN_sock, ch); if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %x, %p)\n", __func__, ch, cmd, arg); switch (cmd) { case CLOSE_CHANNEL: msk->sk.sk_state = MISDN_CLOSED; break; } return 0; } static inline void mISDN_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { struct timeval tv; if (_pms(sk)->cmask & MISDN_TIME_STAMP) { skb_get_timestamp(skb, &tv); put_cmsg(msg, SOL_MISDN, MISDN_TIME_STAMP, sizeof(tv), &tv); } } static int mISDN_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sk_buff *skb; struct sock *sk = sock->sk; int copied, err; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d, flags %x ch.nr %d, proto %x\n", __func__, (int)len, flags, _pms(sk)->ch.nr, sk->sk_protocol); if (flags & (MSG_OOB)) return -EOPNOTSUPP; if (sk->sk_state == MISDN_CLOSED) return 0; skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); if (!skb) return err; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) { maddr->channel = (mISDN_HEAD_ID(skb) >> 16) & 0xff; maddr->tei = (mISDN_HEAD_ID(skb) >> 8) & 0xff; maddr->sapi = mISDN_HEAD_ID(skb) & 0xff; } else { maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xFF; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xFF; } msg->msg_namelen = sizeof(*maddr); } copied = skb->len + MISDN_HEADER_LEN; if (len < copied) { if (flags & MSG_PEEK) refcount_dec(&skb->users); else skb_queue_head(&sk->sk_receive_queue, skb); return -ENOSPC; } memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); err = skb_copy_datagram_msg(skb, 0, msg, copied); mISDN_sock_cmsg(sk, msg, skb); skb_free_datagram(sk, skb); return err ? : copied; } static int mISDN_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb; int err = -ENOMEM; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: len %d flags %x ch %d proto %x\n", __func__, (int)len, msg->msg_flags, _pms(sk)->ch.nr, sk->sk_protocol); if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_NOSIGNAL | MSG_ERRQUEUE)) return -EINVAL; if (len < MISDN_HEADER_LEN) return -EINVAL; if (sk->sk_state != MISDN_BOUND) return -EBADFD; lock_sock(sk); skb = _l2_alloc_skb(len, GFP_KERNEL); if (!skb) goto done; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { err = -EFAULT; goto done; } memcpy(mISDN_HEAD_P(skb), skb->data, MISDN_HEADER_LEN); skb_pull(skb, MISDN_HEADER_LEN); if (msg->msg_namelen >= sizeof(struct sockaddr_mISDN)) { /* if we have a address, we use it */ DECLARE_SOCKADDR(struct sockaddr_mISDN *, maddr, msg->msg_name); mISDN_HEAD_ID(skb) = maddr->channel; } else { /* use default for L2 messages */ if ((sk->sk_protocol == ISDN_P_LAPD_TE) || (sk->sk_protocol == ISDN_P_LAPD_NT)) mISDN_HEAD_ID(skb) = _pms(sk)->ch.nr; } if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s: ID:%x\n", __func__, mISDN_HEAD_ID(skb)); err = -ENODEV; if (!_pms(sk)->ch.peer) goto done; err = _pms(sk)->ch.recv(_pms(sk)->ch.peer, skb); if (err) goto done; else { skb = NULL; err = len; } done: if (skb) kfree_skb(skb); release_sock(sk); return err; } static int data_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: if (sk->sk_state == MISDN_BOUND) delete_channel(&_pms(sk)->ch); else mISDN_sock_unlink(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: delete_channel(&_pms(sk)->ch); mISDN_sock_unlink(&data_sockets, sk); break; } lock_sock(sk); sock_orphan(sk); skb_queue_purge(&sk->sk_receive_queue); release_sock(sk); sock_put(sk); return 0; } static int data_sock_ioctl_bound(struct sock *sk, unsigned int cmd, void __user *p) { struct mISDN_ctrl_req cq; int err = -EINVAL, val[2]; struct mISDNchannel *bchan, *next; lock_sock(sk); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } switch (cmd) { case IMCTRLREQ: if (copy_from_user(&cq, p, sizeof(cq))) { err = -EFAULT; break; } if ((sk->sk_protocol & ~ISDN_P_B_MASK) == ISDN_P_B_START) { list_for_each_entry_safe(bchan, next, &_pms(sk)->dev->bchannels, list) { if (bchan->nr == cq.channel) { err = bchan->ctrl(bchan, CONTROL_CHANNEL, &cq); break; } } } else err = _pms(sk)->dev->D.ctrl(&_pms(sk)->dev->D, CONTROL_CHANNEL, &cq); if (err) break; if (copy_to_user(p, &cq, sizeof(cq))) err = -EFAULT; break; case IMCLEAR_L2: if (sk->sk_protocol != ISDN_P_LAPD_NT) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; case IMHOLD_L1: if (sk->sk_protocol != ISDN_P_LAPD_NT && sk->sk_protocol != ISDN_P_LAPD_TE) { err = -EINVAL; break; } val[0] = cmd; if (get_user(val[1], (int __user *)p)) { err = -EFAULT; break; } err = _pms(sk)->dev->teimgr->ctrl(_pms(sk)->dev->teimgr, CONTROL_CHANNEL, val); break; default: err = -EINVAL; break; } done: release_sock(sk); return err; } static int data_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct sock *sk = sock->sk; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; default: if (sk->sk_state == MISDN_BOUND) err = data_sock_ioctl_bound(sk, cmd, (void __user *)arg); else err = -ENOTCONN; } return err; } static int data_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int len) { struct sock *sk = sock->sk; int err = 0, opt = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p, %d, %x, %p, %d)\n", __func__, sock, level, optname, optval, len); lock_sock(sk); switch (optname) { case MISDN_TIME_STAMP: if (get_user(opt, (int __user *)optval)) { err = -EFAULT; break; } if (opt) _pms(sk)->cmask |= MISDN_TIME_STAMP; else _pms(sk)->cmask &= ~MISDN_TIME_STAMP; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int data_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, opt; if (get_user(len, optlen)) return -EFAULT; if (len != sizeof(char)) return -EINVAL; switch (optname) { case MISDN_TIME_STAMP: if (_pms(sk)->cmask & MISDN_TIME_STAMP) opt = 1; else opt = 0; if (put_user(opt, optval)) return -EFAULT; break; default: return -ENOPROTOOPT; } return 0; } static int data_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; struct sock *csk; int err = 0; if (*debug & DEBUG_SOCKET) printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (addr_len != sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } if (sk->sk_protocol < ISDN_P_B_START) { read_lock_bh(&data_sockets.lock); sk_for_each(csk, &data_sockets.head) { if (sk == csk) continue; if (_pms(csk)->dev != _pms(sk)->dev) continue; if (csk->sk_protocol >= ISDN_P_B_START) continue; if (IS_ISDN_P_TE(csk->sk_protocol) == IS_ISDN_P_TE(sk->sk_protocol)) continue; read_unlock_bh(&data_sockets.lock); err = -EBUSY; goto done; } read_unlock_bh(&data_sockets.lock); } _pms(sk)->ch.send = mISDN_send; _pms(sk)->ch.ctrl = mISDN_ctrl; switch (sk->sk_protocol) { case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: mISDN_sock_unlink(&data_sockets, sk); err = connect_layer1(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); if (err) mISDN_sock_link(&data_sockets, sk); break; case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: err = create_l2entity(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = connect_Bstack(_pms(sk)->dev, &_pms(sk)->ch, sk->sk_protocol, maddr); break; default: err = -EPROTONOSUPPORT; } if (err) goto done; sk->sk_state = MISDN_BOUND; _pms(sk)->ch.protocol = sk->sk_protocol; done: release_sock(sk); return err; } static int data_sock_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; if (!_pms(sk)->dev) return -EBADFD; lock_sock(sk); *addr_len = sizeof(*maddr); maddr->family = AF_ISDN; maddr->dev = _pms(sk)->dev->id; maddr->channel = _pms(sk)->ch.nr; maddr->sapi = _pms(sk)->ch.addr & 0xff; maddr->tei = (_pms(sk)->ch.addr >> 8) & 0xff; release_sock(sk); return 0; } static const struct proto_ops data_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = data_sock_release, .ioctl = data_sock_ioctl, .bind = data_sock_bind, .getname = data_sock_getname, .sendmsg = mISDN_sock_sendmsg, .recvmsg = mISDN_sock_recvmsg, .poll = datagram_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = data_sock_setsockopt, .getsockopt = data_sock_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int data_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &data_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&data_sockets, sk); return 0; } static int base_sock_release(struct socket *sock) { struct sock *sk = sock->sk; printk(KERN_DEBUG "%s(%p) sk=%p\n", __func__, sock, sk); if (!sk) return 0; mISDN_sock_unlink(&base_sockets, sk); sock_orphan(sk); sock_put(sk); return 0; } static int base_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { int err = 0, id; struct mISDNdevice *dev; struct mISDNversion ver; switch (cmd) { case IMGETVERSION: ver.major = MISDN_MAJOR_VERSION; ver.minor = MISDN_MINOR_VERSION; ver.release = MISDN_RELEASE; if (copy_to_user((void __user *)arg, &ver, sizeof(ver))) err = -EFAULT; break; case IMGETCOUNT: id = get_mdevice_count(); if (put_user(id, (int __user *)arg)) err = -EFAULT; break; case IMGETDEVINFO: if (get_user(id, (int __user *)arg)) { err = -EFAULT; break; } dev = get_mdevice(id); if (dev) { struct mISDN_devinfo di; memset(&di, 0, sizeof(di)); di.id = dev->id; di.Dprotocols = dev->Dprotocols; di.Bprotocols = dev->Bprotocols | get_all_Bprotocols(); di.protocol = dev->D.protocol; memcpy(di.channelmap, dev->channelmap, sizeof(di.channelmap)); di.nrbchan = dev->nrbchan; strscpy(di.name, dev_name(&dev->dev), sizeof(di.name)); if (copy_to_user((void __user *)arg, &di, sizeof(di))) err = -EFAULT; } else err = -ENODEV; break; case IMSETDEVNAME: { struct mISDN_devrename dn; if (copy_from_user(&dn, (void __user *)arg, sizeof(dn))) { err = -EFAULT; break; } dn.name[sizeof(dn.name) - 1] = '\0'; dev = get_mdevice(dn.id); if (dev) err = device_rename(&dev->dev, dn.name); else err = -ENODEV; } break; default: err = -EINVAL; } return err; } static int base_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_mISDN *maddr = (struct sockaddr_mISDN *) addr; struct sock *sk = sock->sk; int err = 0; if (addr_len < sizeof(struct sockaddr_mISDN)) return -EINVAL; if (!maddr || maddr->family != AF_ISDN) return -EINVAL; lock_sock(sk); if (_pms(sk)->dev) { err = -EALREADY; goto done; } _pms(sk)->dev = get_mdevice(maddr->dev); if (!_pms(sk)->dev) { err = -ENODEV; goto done; } sk->sk_state = MISDN_BOUND; done: release_sock(sk); return err; } static const struct proto_ops base_sock_ops = { .family = PF_ISDN, .owner = THIS_MODULE, .release = base_sock_release, .ioctl = base_sock_ioctl, .bind = base_sock_bind, .getname = sock_no_getname, .sendmsg = sock_no_sendmsg, .recvmsg = sock_no_recvmsg, .poll = sock_no_poll, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_no_setsockopt, .getsockopt = sock_no_getsockopt, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .mmap = sock_no_mmap }; static int base_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (!capable(CAP_NET_RAW)) return -EPERM; sk = sk_alloc(net, PF_ISDN, GFP_KERNEL, &mISDN_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); sock->ops = &base_sock_ops; sock->state = SS_UNCONNECTED; sock_reset_flag(sk, SOCK_ZAPPED); sk->sk_protocol = protocol; sk->sk_state = MISDN_OPEN; mISDN_sock_link(&base_sockets, sk); return 0; } static int mISDN_sock_create(struct net *net, struct socket *sock, int proto, int kern) { int err = -EPROTONOSUPPORT; switch (proto) { case ISDN_P_BASE: err = base_sock_create(net, sock, proto, kern); break; case ISDN_P_TE_S0: case ISDN_P_NT_S0: case ISDN_P_TE_E1: case ISDN_P_NT_E1: case ISDN_P_LAPD_TE: case ISDN_P_LAPD_NT: case ISDN_P_B_RAW: case ISDN_P_B_HDLC: case ISDN_P_B_X75SLP: case ISDN_P_B_L2DTMF: case ISDN_P_B_L2DSP: case ISDN_P_B_L2DSPHDLC: err = data_sock_create(net, sock, proto, kern); break; default: return err; } return err; } static const struct net_proto_family mISDN_sock_family_ops = { .owner = THIS_MODULE, .family = PF_ISDN, .create = mISDN_sock_create, }; int misdn_sock_init(u_int *deb) { int err; debug = deb; err = sock_register(&mISDN_sock_family_ops); if (err) printk(KERN_ERR "%s: error(%d)\n", __func__, err); return err; } void misdn_sock_cleanup(void) { sock_unregister(PF_ISDN); }
15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 /* * Copyright (c) 2008 Patrick McHardy <kaber@trash.net> * Copyright (c) 2012-2013 Pablo Neira Ayuso <pablo@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/init.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/netfilter_ipv4.h> #include <net/netfilter/nf_tables.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/netfilter/nf_tables_ipv4.h> static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo_ipv4(&pkt, skb, state); return nft_do_chain(&pkt, priv); } static unsigned int nft_ipv4_output(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (unlikely(skb->len < sizeof(struct iphdr) || ip_hdr(skb)->ihl < sizeof(struct iphdr) / 4)) { if (net_ratelimit()) pr_info("nf_tables_ipv4: ignoring short SOCK_RAW " "packet\n"); return NF_ACCEPT; } return nft_do_chain_ipv4(priv, skb, state); } struct nft_af_info nft_af_ipv4 __read_mostly = { .family = NFPROTO_IPV4, .nhooks = NF_INET_NUMHOOKS, .owner = THIS_MODULE, .nops = 1, .hooks = { [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, [NF_INET_LOCAL_OUT] = nft_ipv4_output, [NF_INET_FORWARD] = nft_do_chain_ipv4, [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, }, }; EXPORT_SYMBOL_GPL(nft_af_ipv4); static int nf_tables_ipv4_init_net(struct net *net) { net->nft.ipv4 = kmalloc(sizeof(struct nft_af_info), GFP_KERNEL); if (net->nft.ipv4 == NULL) return -ENOMEM; memcpy(net->nft.ipv4, &nft_af_ipv4, sizeof(nft_af_ipv4)); if (nft_register_afinfo(net, net->nft.ipv4) < 0) goto err; return 0; err: kfree(net->nft.ipv4); return -ENOMEM; } static void nf_tables_ipv4_exit_net(struct net *net) { nft_unregister_afinfo(net, net->nft.ipv4); kfree(net->nft.ipv4); } static struct pernet_operations nf_tables_ipv4_net_ops = { .init = nf_tables_ipv4_init_net, .exit = nf_tables_ipv4_exit_net, }; static const struct nf_chain_type filter_ipv4 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_IPV4, .owner = THIS_MODULE, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), }; static int __init nf_tables_ipv4_init(void) { int ret; ret = nft_register_chain_type(&filter_ipv4); if (ret < 0) return ret; ret = register_pernet_subsys(&nf_tables_ipv4_net_ops); if (ret < 0) nft_unregister_chain_type(&filter_ipv4); return ret; } static void __exit nf_tables_ipv4_exit(void) { unregister_pernet_subsys(&nf_tables_ipv4_net_ops); nft_unregister_chain_type(&filter_ipv4); } module_init(nf_tables_ipv4_init); module_exit(nf_tables_ipv4_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_FAMILY(AF_INET);
1333 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 /* fs/ internal definitions * * Copyright (C) 2006 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ struct super_block; struct file_system_type; struct iomap; struct iomap_ops; struct linux_binprm; struct path; struct mount; struct shrink_control; /* * block_dev.c */ #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); extern int __sync_blockdev(struct block_device *bdev, int wait); #else static inline void bdev_cache_init(void) { } static inline int __sync_blockdev(struct block_device *bdev, int wait) { return 0; } #endif /* * buffer.c */ extern void guard_bio_eod(int rw, struct bio *bio); extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, get_block_t *get_block, struct iomap *iomap); /* * char_dev.c */ extern void __init chrdev_init(void); /* * namei.c */ extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); /* * namespace.c */ extern void *copy_mount_options(const void __user *); extern char *copy_mount_string(const void __user *); extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, struct path *); extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); extern int mnt_want_write_file_path(struct file *); extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); extern void mnt_drop_write_file_path(struct file *); /* * fs_struct.c */ extern void chroot_fs_refs(const struct path *, const struct path *); /* * file_table.c */ extern struct file *get_empty_filp(void); /* * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); extern bool trylock_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, int, const char *, void *); extern struct super_block *user_get_super(dev_t); /* * open.c */ struct open_flags { int open_flag; umode_t mode; int acc_mode; int intent; int lookup_flags; }; extern struct file *do_filp_open(int dfd, struct filename *pathname, const struct open_flags *op); extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file *filp_clone_open(struct file *); /* * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); extern bool __atime_needs_update(const struct path *, struct inode *, bool); static inline bool atime_needs_update_rcu(const struct path *path, struct inode *inode) { return __atime_needs_update(path, inode, true); } /* * fs-writeback.c */ extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); extern int invalidate_inodes(struct super_block *, bool); /* * dcache.c */ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern int d_set_mounted(struct dentry *dentry); extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); extern struct dentry *d_alloc_cursor(struct dentry *); /* * read_write.c */ extern int rw_verify_area(int, struct file *, const loff_t *, size_t); /* * pipe.c */ extern const struct file_operations pipefifo_fops; /* * fs_pin.c */ extern void group_pin_kill(struct hlist_head *p); extern void mnt_pin_kill(struct mount *m); /* * fs/nsfs.c */ extern const struct dentry_operations ns_dentry_operations; /* * fs/ioctl.c */ extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, unsigned long arg); extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * iomap support: */ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, void *data, struct iomap *iomap); loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, const struct iomap_ops *ops, void *data, iomap_actor_t actor); /* direct-io.c: */ int sb_init_dio_done_wq(struct super_block *sb);
3 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_CONNTRACK_ZONES_H #define _NF_CONNTRACK_ZONES_H #include <linux/netfilter/nf_conntrack_zones_common.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack_extend.h> static inline const struct nf_conntrack_zone * nf_ct_zone(const struct nf_conn *ct) { #ifdef CONFIG_NF_CONNTRACK_ZONES return &ct->zone; #else return &nf_ct_zone_dflt; #endif } static inline const struct nf_conntrack_zone * nf_ct_zone_init(struct nf_conntrack_zone *zone, u16 id, u8 dir, u8 flags) { zone->id = id; zone->flags = flags; zone->dir = dir; return zone; } static inline const struct nf_conntrack_zone * nf_ct_zone_tmpl(const struct nf_conn *tmpl, const struct sk_buff *skb, struct nf_conntrack_zone *tmp) { #ifdef CONFIG_NF_CONNTRACK_ZONES if (!tmpl) return &nf_ct_zone_dflt; if (tmpl->zone.flags & NF_CT_FLAG_MARK) return nf_ct_zone_init(tmp, skb->mark, tmpl->zone.dir, 0); #endif return nf_ct_zone(tmpl); } static inline void nf_ct_zone_add(struct nf_conn *ct, const struct nf_conntrack_zone *zone) { #ifdef CONFIG_NF_CONNTRACK_ZONES ct->zone = *zone; #endif } static inline bool nf_ct_zone_matches_dir(const struct nf_conntrack_zone *zone, enum ip_conntrack_dir dir) { return zone->dir & (1 << dir); } static inline u16 nf_ct_zone_id(const struct nf_conntrack_zone *zone, enum ip_conntrack_dir dir) { #ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone_matches_dir(zone, dir) ? zone->id : NF_CT_DEFAULT_ZONE_ID; #else return NF_CT_DEFAULT_ZONE_ID; #endif } static inline bool nf_ct_zone_equal(const struct nf_conn *a, const struct nf_conntrack_zone *b, enum ip_conntrack_dir dir) { #ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone_id(nf_ct_zone(a), dir) == nf_ct_zone_id(b, dir); #else return true; #endif } static inline bool nf_ct_zone_equal_any(const struct nf_conn *a, const struct nf_conntrack_zone *b) { #ifdef CONFIG_NF_CONNTRACK_ZONES return nf_ct_zone(a)->id == b->id; #else return true; #endif } #endif /* IS_ENABLED(CONFIG_NF_CONNTRACK) */ #endif /* _NF_CONNTRACK_ZONES_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DELAY_H #define _LINUX_DELAY_H /* * Copyright (C) 1993 Linus Torvalds * * Delay routines, using a pre-computed "loops_per_jiffy" value. * * Please note that ndelay(), udelay() and mdelay() may return early for * several reasons: * 1. computed loops_per_jiffy too low (due to the time taken to * execute the timer interrupt.) * 2. cache behaviour affecting the time it takes to execute the * loop function. * 3. CPU clock rate changes. * * Please see this thread: * http://lists.openwall.net/linux-kernel/2011/01/09/56 */ #include <linux/kernel.h> extern unsigned long loops_per_jiffy; #include <asm/delay.h> /* * Using udelay() for intervals greater than a few milliseconds can * risk overflow for high loops_per_jiffy (high bogomips) machines. The * mdelay() provides a wrapper to prevent this. For delays greater * than MAX_UDELAY_MS milliseconds, the wrapper is used. Architecture * specific values can be defined in asm-???/delay.h as an override. * The 2nd mdelay() definition ensures GCC will optimize away the * while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G. */ #ifndef MAX_UDELAY_MS #define MAX_UDELAY_MS 5 #endif #ifndef mdelay #define mdelay(n) (\ (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \ ({unsigned long __ms=(n); while (__ms--) udelay(1000);})) #endif #ifndef ndelay static inline void ndelay(unsigned long x) { udelay(DIV_ROUND_UP(x, 1000)); } #define ndelay(x) ndelay(x) #endif extern unsigned long lpj_fine; void calibrate_delay(void); void msleep(unsigned int msecs); unsigned long msleep_interruptible(unsigned int msecs); void usleep_range(unsigned long min, unsigned long max); static inline void ssleep(unsigned int seconds) { msleep(seconds * 1000); } #endif /* defined(_LINUX_DELAY_H) */
2 2 2 13 13 13 25 25 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 // SPDX-License-Identifier: GPL-2.0 #include <linux/proc_fs.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/bonding.h> #include "bonding_priv.h" static void *bond_info_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct bonding *bond = seq->private; struct list_head *iter; struct slave *slave; loff_t off = 0; rcu_read_lock(); if (*pos == 0) return SEQ_START_TOKEN; bond_for_each_slave_rcu(bond, slave, iter) if (++off == *pos) return slave; return NULL; } static void *bond_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bonding *bond = seq->private; struct list_head *iter; struct slave *slave; bool found = false; ++*pos; if (v == SEQ_START_TOKEN) return bond_first_slave_rcu(bond); bond_for_each_slave_rcu(bond, slave, iter) { if (found) return slave; if (slave == v) found = true; } return NULL; } static void bond_info_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static void bond_info_show_master(struct seq_file *seq) { struct bonding *bond = seq->private; const struct bond_opt_value *optval; struct slave *curr, *primary; int i; curr = rcu_dereference(bond->curr_active_slave); seq_printf(seq, "Bonding Mode: %s", bond_mode_name(BOND_MODE(bond))); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac) { optval = bond_opt_get_val(BOND_OPT_FAIL_OVER_MAC, bond->params.fail_over_mac); seq_printf(seq, " (fail_over_mac %s)", optval->string); } seq_printf(seq, "\n"); if (bond_mode_uses_xmit_hash(bond)) { optval = bond_opt_get_val(BOND_OPT_XMIT_HASH, bond->params.xmit_policy); seq_printf(seq, "Transmit Hash Policy: %s (%d)\n", optval->string, bond->params.xmit_policy); } if (bond_uses_primary(bond)) { primary = rcu_dereference(bond->primary_slave); seq_printf(seq, "Primary Slave: %s", primary ? primary->dev->name : "None"); if (primary) { optval = bond_opt_get_val(BOND_OPT_PRIMARY_RESELECT, bond->params.primary_reselect); seq_printf(seq, " (primary_reselect %s)", optval->string); } seq_printf(seq, "\nCurrently Active Slave: %s\n", (curr) ? curr->dev->name : "None"); } seq_printf(seq, "MII Status: %s\n", netif_carrier_ok(bond->dev) ? "up" : "down"); seq_printf(seq, "MII Polling Interval (ms): %d\n", bond->params.miimon); seq_printf(seq, "Up Delay (ms): %d\n", bond->params.updelay * bond->params.miimon); seq_printf(seq, "Down Delay (ms): %d\n", bond->params.downdelay * bond->params.miimon); /* ARP information */ if (bond->params.arp_interval > 0) { int printed = 0; seq_printf(seq, "ARP Polling Interval (ms): %d\n", bond->params.arp_interval); seq_printf(seq, "ARP IP target/s (n.n.n.n form):"); for (i = 0; (i < BOND_MAX_ARP_TARGETS); i++) { if (!bond->params.arp_targets[i]) break; if (printed) seq_printf(seq, ","); seq_printf(seq, " %pI4", &bond->params.arp_targets[i]); printed = 1; } seq_printf(seq, "\n"); } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; seq_puts(seq, "\n802.3ad info\n"); seq_printf(seq, "LACP rate: %s\n", (bond->params.lacp_fast) ? "fast" : "slow"); seq_printf(seq, "Min links: %d\n", bond->params.min_links); optval = bond_opt_get_val(BOND_OPT_AD_SELECT, bond->params.ad_select); seq_printf(seq, "Aggregator selection policy (ad_select): %s\n", optval->string); if (capable(CAP_NET_ADMIN)) { seq_printf(seq, "System priority: %d\n", BOND_AD_INFO(bond).system.sys_priority); seq_printf(seq, "System MAC address: %pM\n", &BOND_AD_INFO(bond).system.sys_mac_addr); if (__bond_3ad_get_active_agg_info(bond, &ad_info)) { seq_printf(seq, "bond %s has no active aggregator\n", bond->dev->name); } else { seq_printf(seq, "Active Aggregator Info:\n"); seq_printf(seq, "\tAggregator ID: %d\n", ad_info.aggregator_id); seq_printf(seq, "\tNumber of ports: %d\n", ad_info.ports); seq_printf(seq, "\tActor Key: %d\n", ad_info.actor_key); seq_printf(seq, "\tPartner Key: %d\n", ad_info.partner_key); seq_printf(seq, "\tPartner Mac Address: %pM\n", ad_info.partner_system); } } } } static void bond_info_show_slave(struct seq_file *seq, const struct slave *slave) { struct bonding *bond = seq->private; seq_printf(seq, "\nSlave Interface: %s\n", slave->dev->name); seq_printf(seq, "MII Status: %s\n", bond_slave_link_status(slave->link)); if (slave->speed == SPEED_UNKNOWN) seq_printf(seq, "Speed: %s\n", "Unknown"); else seq_printf(seq, "Speed: %d Mbps\n", slave->speed); if (slave->duplex == DUPLEX_UNKNOWN) seq_printf(seq, "Duplex: %s\n", "Unknown"); else seq_printf(seq, "Duplex: %s\n", slave->duplex ? "full" : "half"); seq_printf(seq, "Link Failure Count: %u\n", slave->link_failure_count); seq_printf(seq, "Permanent HW addr: %*phC\n", slave->dev->addr_len, slave->perm_hwaddr); seq_printf(seq, "Slave queue ID: %d\n", slave->queue_id); if (BOND_MODE(bond) == BOND_MODE_8023AD) { const struct port *port = &SLAVE_AD_INFO(slave)->port; const struct aggregator *agg = port->aggregator; if (agg) { seq_printf(seq, "Aggregator ID: %d\n", agg->aggregator_identifier); seq_printf(seq, "Actor Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_actor_state)); seq_printf(seq, "Partner Churn State: %s\n", bond_3ad_churn_desc(port->sm_churn_partner_state)); seq_printf(seq, "Actor Churned Count: %d\n", port->churn_actor_count); seq_printf(seq, "Partner Churned Count: %d\n", port->churn_partner_count); if (capable(CAP_NET_ADMIN)) { seq_puts(seq, "details actor lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->actor_system_priority); seq_printf(seq, " system mac address: %pM\n", &port->actor_system); seq_printf(seq, " port key: %d\n", port->actor_oper_port_key); seq_printf(seq, " port priority: %d\n", port->actor_port_priority); seq_printf(seq, " port number: %d\n", port->actor_port_number); seq_printf(seq, " port state: %d\n", port->actor_oper_port_state); seq_puts(seq, "details partner lacp pdu:\n"); seq_printf(seq, " system priority: %d\n", port->partner_oper.system_priority); seq_printf(seq, " system mac address: %pM\n", &port->partner_oper.system); seq_printf(seq, " oper key: %d\n", port->partner_oper.key); seq_printf(seq, " port priority: %d\n", port->partner_oper.port_priority); seq_printf(seq, " port number: %d\n", port->partner_oper.port_number); seq_printf(seq, " port state: %d\n", port->partner_oper.port_state); } } else { seq_puts(seq, "Aggregator ID: N/A\n"); } } } static int bond_info_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_printf(seq, "%s\n", bond_version); bond_info_show_master(seq); } else bond_info_show_slave(seq, v); return 0; } static const struct seq_operations bond_info_seq_ops = { .start = bond_info_seq_start, .next = bond_info_seq_next, .stop = bond_info_seq_stop, .show = bond_info_seq_show, }; static int bond_info_open(struct inode *inode, struct file *file) { struct seq_file *seq; int res; res = seq_open(file, &bond_info_seq_ops); if (!res) { /* recover the pointer buried in proc_dir_entry data */ seq = file->private_data; seq->private = PDE_DATA(inode); } return res; } static const struct file_operations bond_info_fops = { .owner = THIS_MODULE, .open = bond_info_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, }; void bond_create_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir) { bond->proc_entry = proc_create_data(bond_dev->name, S_IRUGO, bn->proc_dir, &bond_info_fops, bond); if (bond->proc_entry == NULL) netdev_warn(bond_dev, "Cannot create /proc/net/%s/%s\n", DRV_NAME, bond_dev->name); else memcpy(bond->proc_file_name, bond_dev->name, IFNAMSIZ); } } void bond_remove_proc_entry(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); if (bn->proc_dir && bond->proc_entry) { remove_proc_entry(bond->proc_file_name, bn->proc_dir); memset(bond->proc_file_name, 0, IFNAMSIZ); bond->proc_entry = NULL; } } /* Create the bonding directory under /proc/net, if doesn't exist yet. * Caller must hold rtnl_lock. */ void __net_init bond_create_proc_dir(struct bond_net *bn) { if (!bn->proc_dir) { bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net); if (!bn->proc_dir) pr_warn("Warning: Cannot create /proc/net/%s\n", DRV_NAME); } } /* Destroy the bonding directory under /proc/net, if empty. * Caller must hold rtnl_lock. */ void __net_exit bond_destroy_proc_dir(struct bond_net *bn) { if (bn->proc_dir) { remove_proc_entry(DRV_NAME, bn->net->proc_net); bn->proc_dir = NULL; } }
8 3 5 8 8 8 8 10 10 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #include <linux/types.h> #include <linux/ipv6.h> #include <linux/in6.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/icmp.h> #include <linux/sysctl.h> #include <net/ipv6_frag.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> #endif #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/ipv6/nf_defrag_ipv6.h> static DEFINE_MUTEX(defrag6_mutex); static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, struct sk_buff *skb) { u16 zone_id = NF_CT_DEFAULT_ZONE_ID; #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (skb_nfct(skb)) { enum ip_conntrack_info ctinfo; const struct nf_conn *ct = nf_ct_get(skb, &ctinfo); zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo)); } #endif if (nf_bridge_in_prerouting(skb)) return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id; if (hooknum == NF_INET_PRE_ROUTING) return IP6_DEFRAG_CONNTRACK_IN + zone_id; else return IP6_DEFRAG_CONNTRACK_OUT + zone_id; } static unsigned int ipv6_defrag(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int err; #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Previously seen (loopback)? */ if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb))) return NF_ACCEPT; #endif err = nf_ct_frag6_gather(state->net, skb, nf_ct6_defrag_user(state->hook, skb)); /* queued */ if (err == -EINPROGRESS) return NF_STOLEN; return err == 0 ? NF_ACCEPT : NF_DROP; } static const struct nf_hook_ops ipv6_defrag_ops[] = { { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, { .hook = ipv6_defrag, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, }, }; static void __net_exit defrag6_net_exit(struct net *net) { if (net->nf.defrag_ipv6) { nf_unregister_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); net->nf.defrag_ipv6 = false; } } static struct pernet_operations defrag6_net_ops = { .exit = defrag6_net_exit, }; static int __init nf_defrag_init(void) { int ret = 0; ret = nf_ct_frag6_init(); if (ret < 0) { pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); return ret; } ret = register_pernet_subsys(&defrag6_net_ops); if (ret < 0) { pr_err("nf_defrag_ipv6: can't register pernet ops\n"); goto cleanup_frag6; } return ret; cleanup_frag6: nf_ct_frag6_cleanup(); return ret; } static void __exit nf_defrag_fini(void) { unregister_pernet_subsys(&defrag6_net_ops); nf_ct_frag6_cleanup(); } int nf_defrag_ipv6_enable(struct net *net) { int err = 0; might_sleep(); if (net->nf.defrag_ipv6) return 0; mutex_lock(&defrag6_mutex); if (net->nf.defrag_ipv6) goto out_unlock; err = nf_register_net_hooks(net, ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); if (err == 0) net->nf.defrag_ipv6 = true; out_unlock: mutex_unlock(&defrag6_mutex); return err; } EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); module_init(nf_defrag_init); module_exit(nf_defrag_fini); MODULE_LICENSE("GPL");
4 4 4 3 2 2 2 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 /* * Copyright (c) 2003+ Evgeniy Polyakov <zbr@ioremap.net> * * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, see <http://www.gnu.org/licenses/>. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/if.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/tcp.h> #include <net/ip.h> #include <net/tcp.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/x_tables.h> #include <net/netfilter/nf_log.h> #include <linux/netfilter/xt_osf.h> struct xt_osf_finger { struct rcu_head rcu_head; struct list_head finger_entry; struct xt_osf_user_finger finger; }; enum osf_fmatch_states { /* Packet does not match the fingerprint */ FMATCH_WRONG = 0, /* Packet matches the fingerprint */ FMATCH_OK, /* Options do not match the fingerprint, but header does */ FMATCH_OPT_WRONG, }; /* * Indexed by dont-fragment bit. * It is the only constant value in the fingerprint. */ static struct list_head xt_osf_fingers[2]; static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = { [OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) }, }; static int xt_osf_add_callback(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const osf_attrs[], struct netlink_ext_ack *extack) { struct xt_osf_user_finger *f; struct xt_osf_finger *kf = NULL, *sf; int err = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; if (!(nlh->nlmsg_flags & NLM_F_CREATE)) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); kf = kmalloc(sizeof(struct xt_osf_finger), GFP_KERNEL); if (!kf) return -ENOMEM; memcpy(&kf->finger, f, sizeof(struct xt_osf_user_finger)); list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) continue; kfree(kf); kf = NULL; if (nlh->nlmsg_flags & NLM_F_EXCL) err = -EEXIST; break; } /* * We are protected by nfnl mutex. */ if (kf) list_add_tail_rcu(&kf->finger_entry, &xt_osf_fingers[!!f->df]); return err; } static int xt_osf_remove_callback(struct net *net, struct sock *ctnl, struct sk_buff *skb, const struct nlmsghdr *nlh, const struct nlattr * const osf_attrs[], struct netlink_ext_ack *extack) { struct xt_osf_user_finger *f; struct xt_osf_finger *sf; int err = -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!osf_attrs[OSF_ATTR_FINGER]) return -EINVAL; f = nla_data(osf_attrs[OSF_ATTR_FINGER]); list_for_each_entry(sf, &xt_osf_fingers[!!f->df], finger_entry) { if (memcmp(&sf->finger, f, sizeof(struct xt_osf_user_finger))) continue; /* * We are protected by nfnl mutex. */ list_del_rcu(&sf->finger_entry); kfree_rcu(sf, rcu_head); err = 0; break; } return err; } static const struct nfnl_callback xt_osf_nfnetlink_callbacks[OSF_MSG_MAX] = { [OSF_MSG_ADD] = { .call = xt_osf_add_callback, .attr_count = OSF_ATTR_MAX, .policy = xt_osf_policy, }, [OSF_MSG_REMOVE] = { .call = xt_osf_remove_callback, .attr_count = OSF_ATTR_MAX, .policy = xt_osf_policy, }, }; static const struct nfnetlink_subsystem xt_osf_nfnetlink = { .name = "osf", .subsys_id = NFNL_SUBSYS_OSF, .cb_count = OSF_MSG_MAX, .cb = xt_osf_nfnetlink_callbacks, }; static inline int xt_osf_ttl(const struct sk_buff *skb, const struct xt_osf_info *info, unsigned char f_ttl) { const struct iphdr *ip = ip_hdr(skb); if (info->flags & XT_OSF_TTL) { if (info->ttl == XT_OSF_TTL_TRUE) return ip->ttl == f_ttl; if (info->ttl == XT_OSF_TTL_NOCHECK) return 1; else if (ip->ttl <= f_ttl) return 1; else { struct in_device *in_dev = __in_dev_get_rcu(skb->dev); int ret = 0; for_ifa(in_dev) { if (inet_ifa_match(ip->saddr, ifa)) { ret = (ip->ttl == f_ttl); break; } } endfor_ifa(in_dev); return ret; } } return ip->ttl == f_ttl; } static bool xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p) { const struct xt_osf_info *info = p->matchinfo; const struct iphdr *ip = ip_hdr(skb); const struct tcphdr *tcp; struct tcphdr _tcph; int fmatch = FMATCH_WRONG, fcount = 0; unsigned int optsize = 0, check_WSS = 0; u16 window, totlen, mss = 0; bool df; const unsigned char *optp = NULL, *_optp = NULL; unsigned char opts[MAX_IPOPTLEN]; const struct xt_osf_finger *kf; const struct xt_osf_user_finger *f; struct net *net = xt_net(p); if (!info) return false; tcp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(struct tcphdr), &_tcph); if (!tcp) return false; if (!tcp->syn) return false; totlen = ntohs(ip->tot_len); df = ntohs(ip->frag_off) & IP_DF; window = ntohs(tcp->window); if (tcp->doff * 4 > sizeof(struct tcphdr)) { optsize = tcp->doff * 4 - sizeof(struct tcphdr); _optp = optp = skb_header_pointer(skb, ip_hdrlen(skb) + sizeof(struct tcphdr), optsize, opts); } list_for_each_entry_rcu(kf, &xt_osf_fingers[df], finger_entry) { int foptsize, optnum; f = &kf->finger; if (!(info->flags & XT_OSF_LOG) && strcmp(info->genre, f->genre)) continue; optp = _optp; fmatch = FMATCH_WRONG; if (totlen != f->ss || !xt_osf_ttl(skb, info, f->ttl)) continue; /* * Should not happen if userspace parser was written correctly. */ if (f->wss.wc >= OSF_WSS_MAX) continue; /* Check options */ foptsize = 0; for (optnum = 0; optnum < f->opt_num; ++optnum) foptsize += f->opt[optnum].length; if (foptsize > MAX_IPOPTLEN || optsize > MAX_IPOPTLEN || optsize != foptsize) continue; check_WSS = f->wss.wc; for (optnum = 0; optnum < f->opt_num; ++optnum) { if (f->opt[optnum].kind == (*optp)) { __u32 len = f->opt[optnum].length; const __u8 *optend = optp + len; fmatch = FMATCH_OK; switch (*optp) { case OSFOPT_MSS: mss = optp[3]; mss <<= 8; mss |= optp[2]; mss = ntohs((__force __be16)mss); break; case OSFOPT_TS: break; } optp = optend; } else fmatch = FMATCH_OPT_WRONG; if (fmatch != FMATCH_OK) break; } if (fmatch != FMATCH_OPT_WRONG) { fmatch = FMATCH_WRONG; switch (check_WSS) { case OSF_WSS_PLAIN: if (f->wss.val == 0 || window == f->wss.val) fmatch = FMATCH_OK; break; case OSF_WSS_MSS: /* * Some smart modems decrease mangle MSS to * SMART_MSS_2, so we check standard, decreased * and the one provided in the fingerprint MSS * values. */ #define SMART_MSS_1 1460 #define SMART_MSS_2 1448 if (window == f->wss.val * mss || window == f->wss.val * SMART_MSS_1 || window == f->wss.val * SMART_MSS_2) fmatch = FMATCH_OK; break; case OSF_WSS_MTU: if (window == f->wss.val * (mss + 40) || window == f->wss.val * (SMART_MSS_1 + 40) || window == f->wss.val * (SMART_MSS_2 + 40)) fmatch = FMATCH_OK; break; case OSF_WSS_MODULO: if ((window % f->wss.val) == 0) fmatch = FMATCH_OK; break; } } if (fmatch != FMATCH_OK) continue; fcount++; if (info->flags & XT_OSF_LOG) nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p), xt_out(p), NULL, "%s [%s:%s] : %pI4:%d -> %pI4:%d hops=%d\n", f->genre, f->version, f->subtype, &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest), f->ttl - ip->ttl); if ((info->flags & XT_OSF_LOG) && info->loglevel == XT_OSF_LOGLEVEL_FIRST) break; } if (!fcount && (info->flags & XT_OSF_LOG)) nf_log_packet(net, xt_family(p), xt_hooknum(p), skb, xt_in(p), xt_out(p), NULL, "Remote OS is not known: %pI4:%u -> %pI4:%u\n", &ip->saddr, ntohs(tcp->source), &ip->daddr, ntohs(tcp->dest)); if (fcount) fmatch = FMATCH_OK; return fmatch == FMATCH_OK; } static struct xt_match xt_osf_match = { .name = "osf", .revision = 0, .family = NFPROTO_IPV4, .proto = IPPROTO_TCP, .hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_FORWARD), .match = xt_osf_match_packet, .matchsize = sizeof(struct xt_osf_info), .me = THIS_MODULE, }; static int __init xt_osf_init(void) { int err = -EINVAL; int i; for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) INIT_LIST_HEAD(&xt_osf_fingers[i]); err = nfnetlink_subsys_register(&xt_osf_nfnetlink); if (err < 0) { pr_err("Failed to register OSF nsfnetlink helper (%d)\n", err); goto err_out_exit; } err = xt_register_match(&xt_osf_match); if (err) { pr_err("Failed to register OS fingerprint " "matching module (%d)\n", err); goto err_out_remove; } return 0; err_out_remove: nfnetlink_subsys_unregister(&xt_osf_nfnetlink); err_out_exit: return err; } static void __exit xt_osf_fini(void) { struct xt_osf_finger *f; int i; nfnetlink_subsys_unregister(&xt_osf_nfnetlink); xt_unregister_match(&xt_osf_match); rcu_read_lock(); for (i=0; i<ARRAY_SIZE(xt_osf_fingers); ++i) { list_for_each_entry_rcu(f, &xt_osf_fingers[i], finger_entry) { list_del_rcu(&f->finger_entry); kfree_rcu(f, rcu_head); } } rcu_read_unlock(); rcu_barrier(); } module_init(xt_osf_init); module_exit(xt_osf_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>"); MODULE_DESCRIPTION("Passive OS fingerprint matching."); MODULE_ALIAS("ipt_osf"); MODULE_ALIAS("ip6t_osf"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/rculist.h> #include <linux/llist.h> #include "rds_single_path.h" #include "ib_mr.h" struct workqueue_struct *rds_ib_mr_wq; static DEFINE_PER_CPU(unsigned long, clean_list_grace); #define CLEAN_LIST_BUSY_BIT 0 static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) { struct rds_ib_device *rds_ibdev; struct rds_ib_ipaddr *i_ipaddr; rcu_read_lock(); list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { refcount_inc(&rds_ibdev->refcount); rcu_read_unlock(); return rds_ibdev; } } } rcu_read_unlock(); return NULL; } static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; i_ipaddr = kmalloc(sizeof *i_ipaddr, GFP_KERNEL); if (!i_ipaddr) return -ENOMEM; i_ipaddr->ipaddr = ipaddr; spin_lock_irq(&rds_ibdev->spinlock); list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); spin_unlock_irq(&rds_ibdev->spinlock); return 0; } static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr; struct rds_ib_ipaddr *to_free = NULL; spin_lock_irq(&rds_ibdev->spinlock); list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { list_del_rcu(&i_ipaddr->list); to_free = i_ipaddr; break; } } spin_unlock_irq(&rds_ibdev->spinlock); if (to_free) kfree_rcu(to_free, rcu); } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); if (!rds_ibdev_old) return rds_ib_add_ipaddr(rds_ibdev, ipaddr); if (rds_ibdev_old != rds_ibdev) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); rds_ib_dev_put(rds_ibdev_old); return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } rds_ib_dev_put(rds_ibdev_old); return 0; } void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* conn was previously on the nodev_conns_list */ spin_lock_irq(&ib_nodev_conns_lock); BUG_ON(list_empty(&ib_nodev_conns)); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_lock(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); spin_unlock(&rds_ibdev->spinlock); spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; refcount_inc(&rds_ibdev->refcount); } void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) { struct rds_ib_connection *ic = conn->c_transport_data; /* place conn on nodev_conns_list */ spin_lock(&ib_nodev_conns_lock); spin_lock_irq(&rds_ibdev->spinlock); BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); spin_unlock_irq(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &ib_nodev_conns); spin_unlock(&ib_nodev_conns_lock); ic->rds_ibdev = NULL; rds_ib_dev_put(rds_ibdev); } void rds_ib_destroy_nodev_conns(void) { struct rds_ib_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ spin_lock_irq(&ib_nodev_conns_lock); list_splice(&ib_nodev_conns, &tmp_list); spin_unlock_irq(&ib_nodev_conns_lock); list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) rds_conn_destroy(ic->conn); } void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo) { struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool; iinfo->rdma_mr_max = pool_1m->max_items; iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages; } struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; struct llist_node *ret; unsigned long *flag; preempt_disable(); flag = this_cpu_ptr(&clean_list_grace); set_bit(CLEAN_LIST_BUSY_BIT, flag); ret = llist_del_first(&pool->clean_list); if (ret) { ibmr = llist_entry(ret, struct rds_ib_mr, llnode); if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_reused); else rds_ib_stats_inc(s_ib_rdma_mr_1m_reused); } clear_bit(CLEAN_LIST_BUSY_BIT, flag); preempt_enable(); return ibmr; } static inline void wait_clean_list_grace(void) { int cpu; unsigned long *flag; for_each_online_cpu(cpu) { flag = &per_cpu(clean_list_grace, cpu); while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) cpu_relax(); } } void rds_ib_sync_mr(void *trans_private, int direction) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; switch (direction) { case DMA_FROM_DEVICE: ib_dma_sync_sg_for_cpu(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; case DMA_TO_DEVICE: ib_dma_sync_sg_for_device(rds_ibdev->dev, ibmr->sg, ibmr->sg_dma_len, DMA_BIDIRECTIONAL); break; } } void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { struct rds_ib_device *rds_ibdev = ibmr->device; if (ibmr->sg_dma_len) { ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len, DMA_BIDIRECTIONAL); ibmr->sg_dma_len = 0; } /* Release the s/g list */ if (ibmr->sg_len) { unsigned int i; for (i = 0; i < ibmr->sg_len; ++i) { struct page *page = sg_page(&ibmr->sg[i]); /* FIXME we need a way to tell a r/w MR * from a r/o MR */ WARN_ON(!page->mapping && irqs_disabled()); set_page_dirty(page); put_page(page); } kfree(ibmr->sg); ibmr->sg = NULL; ibmr->sg_len = 0; } } void rds_ib_teardown_mr(struct rds_ib_mr *ibmr) { unsigned int pinned = ibmr->sg_len; __rds_ib_teardown_mr(ibmr); if (pinned) { struct rds_ib_mr_pool *pool = ibmr->pool; atomic_sub(pinned, &pool->free_pinned); } } static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int free_all) { unsigned int item_count; item_count = atomic_read(&pool->item_count); if (free_all) return item_count; return 0; } /* * given an llist of mrs, put them all into the list_head for more processing */ static unsigned int llist_append_to_list(struct llist_head *llist, struct list_head *list) { struct rds_ib_mr *ibmr; struct llist_node *node; struct llist_node *next; unsigned int count = 0; node = llist_del_all(llist); while (node) { next = node->next; ibmr = llist_entry(node, struct rds_ib_mr, llnode); list_add_tail(&ibmr->unmap_list, list); node = next; count++; } return count; } /* * this takes a list head of mrs and turns it into linked llist nodes * of clusters. Each cluster has linked llist nodes of * MR_CLUSTER_SIZE mrs that are ready for reuse. */ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool, struct list_head *list, struct llist_node **nodes_head, struct llist_node **nodes_tail) { struct rds_ib_mr *ibmr; struct llist_node *cur = NULL; struct llist_node **next = nodes_head; list_for_each_entry(ibmr, list, unmap_list) { cur = &ibmr->llnode; *next = cur; next = &cur->next; } *next = NULL; *nodes_tail = cur; } /* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr; struct llist_node *clean_nodes; struct llist_node *clean_tail; LIST_HEAD(unmap_list); unsigned long unpinned = 0; unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_flush); if (ibmr_ret) { DEFINE_WAIT(wait); while (!mutex_trylock(&pool->flush_lock)) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } prepare_to_wait(&pool->flush_wait, &wait, TASK_UNINTERRUPTIBLE); if (llist_empty(&pool->clean_list)) schedule(); ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; finish_wait(&pool->flush_wait, &wait); goto out_nolock; } } finish_wait(&pool->flush_wait, &wait); } else mutex_lock(&pool->flush_lock); if (ibmr_ret) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) { *ibmr_ret = ibmr; goto out; } } /* Get the list of all MRs to be dropped. Ordering matters - * we want to put drop_list ahead of free_list. */ dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) llist_append_to_list(&pool->clean_list, &unmap_list); free_goal = rds_ib_flush_goal(pool, free_all); if (list_empty(&unmap_list)) goto out; if (pool->use_fastreg) rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal); else rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal); if (!list_empty(&unmap_list)) { /* we have to make sure that none of the things we're about * to put on the clean list would race with other cpus trying * to pull items off. The llist would explode if we managed to * remove something from the clean list and then add it back again * while another CPU was spinning on that same item in llist_del_first. * * This is pretty unlikely, but just in case wait for an llist grace period * here before adding anything back into the clean list. */ wait_clean_list_grace(); list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail); if (ibmr_ret) { *ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode); clean_nodes = clean_nodes->next; } /* more than one entry in llist nodes */ if (clean_nodes) llist_add_batch(clean_nodes, clean_tail, &pool->clean_list); } atomic_sub(unpinned, &pool->free_pinned); atomic_sub(dirty_to_clean, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); out: mutex_unlock(&pool->flush_lock); if (waitqueue_active(&pool->flush_wait)) wake_up(&pool->flush_wait); out_nolock: return 0; } struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; int iter = 0; while (1) { ibmr = rds_ib_reuse_mr(pool); if (ibmr) return ibmr; if (atomic_inc_return(&pool->item_count) <= pool->max_items) break; atomic_dec(&pool->item_count); if (++iter > 2) { if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted); return ERR_PTR(-EAGAIN); } /* We do have some empty MRs. Flush them out. */ if (pool->pool_type == RDS_IB_MR_8K_POOL) rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait); else rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait); rds_ib_flush_mr_pool(pool, 0, &ibmr); if (ibmr) return ibmr; } return ibmr; } static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); rds_ib_flush_mr_pool(pool, 0, NULL); } void rds_ib_free_mr(void *trans_private, int invalidate) { struct rds_ib_mr *ibmr = trans_private; struct rds_ib_mr_pool *pool = ibmr->pool; struct rds_ib_device *rds_ibdev = ibmr->device; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); /* Return it to the pool's free list */ if (rds_ibdev->use_fastreg) rds_ib_free_frmr_list(ibmr); else rds_ib_free_fmr_list(ibmr); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 5) queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10); } } rds_ib_dev_put(rds_ibdev); } void rds_ib_flush_mrs(void) { struct rds_ib_device *rds_ibdev; down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { if (rds_ibdev->mr_8k_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_8k_pool, 0, NULL); if (rds_ibdev->mr_1m_pool) rds_ib_flush_mr_pool(rds_ibdev->mr_1m_pool, 0, NULL); } up_read(&rds_ib_devices_lock); } void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, struct rds_sock *rs, u32 *key_ret) { struct rds_ib_device *rds_ibdev; struct rds_ib_mr *ibmr = NULL; struct rds_ib_connection *ic = rs->rs_conn->c_transport_data; int ret; rds_ibdev = rds_ib_get_device(rs->rs_bound_addr); if (!rds_ibdev) { ret = -ENODEV; goto out; } if (!rds_ibdev->mr_8k_pool || !rds_ibdev->mr_1m_pool) { ret = -ENODEV; goto out; } if (rds_ibdev->use_fastreg) ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret); else ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret); if (ibmr) rds_ibdev = NULL; out: if (!ibmr) pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret); if (rds_ibdev) rds_ib_dev_put(rds_ibdev); return ibmr; } void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { cancel_delayed_work_sync(&pool->flush_worker); rds_ib_flush_mr_pool(pool, 1, NULL); WARN_ON(atomic_read(&pool->item_count)); WARN_ON(atomic_read(&pool->free_pinned)); kfree(pool); } struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int pool_type) { struct rds_ib_mr_pool *pool; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return ERR_PTR(-ENOMEM); pool->pool_type = pool_type; init_llist_head(&pool->free_list); init_llist_head(&pool->drop_list); init_llist_head(&pool->clean_list); mutex_init(&pool->flush_lock); init_waitqueue_head(&pool->flush_wait); INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); if (pool_type == RDS_IB_MR_1M_POOL) { /* +1 allows for unaligned MRs */ pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1; pool->max_items = RDS_MR_1M_POOL_SIZE; } else { /* pool_type == RDS_IB_MR_8K_POOL */ pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1; pool->max_items = RDS_MR_8K_POOL_SIZE; } pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; pool->fmr_attr.page_shift = PAGE_SHIFT; pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4; pool->use_fastreg = rds_ibdev->use_fastreg; return pool; } int rds_ib_mr_init(void) { rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; } /* By the time this is called all the IB devices should have been torn down and * had their pools freed. As each pool is freed its work struct is waited on, * so the pool flushing work queue should be idle by the time we get here. */ void rds_ib_mr_exit(void) { destroy_workqueue(rds_ib_mr_wq); }
180 180 197 174 188 188 63 135 23 20 14 14 20 7 23 14 12 9 9 7 27 4 31 119 7 106 19 7 5 13 1 128 113 128 15 15 1 1 10 10 10 10 10 10 10 10 10 10 9 10 10 161 40 8 142 126 142 17 129 129 129 149 16 16 16 16 16 148 105 145 1 145 48 33 48 145 102 102 2 145 17 15 15 134 15 3 3 131 2 129 129 1 37 37 37 1 37 34 3 37 1 35 34 142 142 142 142 142 130 129 130 142 16 1 16 102 43 118 105 105 112 107 104 25 112 3 129 98 129 128 111 111 129 7 129 129 129 128 99 129 129 128 129 129 2 111 111 129 2 2 2 129 129 98 111 111 114 114 116 113 112 113 113 110 26 113 113 113 113 26 113 25 11 113 11 113 101 108 129 129 127 126 114 1 113 127 25 25 110 113 129 80 80 129 126 126 128 126 16 16 62 67 67 41 37 37 33 33 3 38 27 16 16 11 11 60 53 163 163 164 2 163 1 163 4 160 163 36 10 10 153 144 143 78 78 78 154 154 6 154 154 2 154 14 1 147 6 154 154 144 144 144 144 10 10 10 10 10 153 154 154 154 36 36 154 147 7 9 14 1 154 163 148 14 32 3 3 55 53 53 2 53 4 53 53 53 53 37 53 50 3 3 3 3 53 1 52 53 53 8 45 40 5 4 1 1 1 1 1 53 53 53 42 53 164 164 2 163 15 15 15 15 15 15 15 15 15 15 15 172 163 172 172 15 15 172 172 15 15 15 172 170 163 170 15 15 15 170 224 224 15 224 166 203 224 15 15 15 224 222 247 1 247 1 1 246 181 247 247 5 5 247 177 177 219 204 247 125 122 124 125 86 86 81 86 86 84 86 86 86 86 86 197 177 197 197 41 197 189 42 9 42 42 42 42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 /* * inode.c * * PURPOSE * Inode handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * This file is distributed under the terms of the GNU General Public * License (GPL). Copies of the GPL can be obtained from: * ftp://prep.ai.mit.edu/pub/gnu/GPL * Each contributing author retains all rights to their own work. * * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * * HISTORY * * 10/04/98 dgb Added rudimentary directory functions * 10/07/98 Fully working udf_block_map! It works! * 11/25/98 bmap altered to better support extents * 12/06/98 blf partition support in udf_iget, udf_block_map * and udf_read_inode * 12/12/98 rewrote udf_block_map to handle next extents and descs across * block boundaries (which is not actually allowed) * 12/20/98 added support for strategy 4096 * 03/07/99 rewrote udf_block_map (again) * New funcs, inode_bmap, udf_next_aext * 04/19/99 Support for writing device EA's for major/minor # */ #include "udfdecl.h" #include <linux/mm.h> #include <linux/module.h> #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" #define EXTENT_MERGE_SIZE 5 static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); static sector_t inode_getblk(struct inode *, sector_t, int *, int *); static int8_t udf_insert_aext(struct inode *, struct extent_position, struct kernel_lb_addr, uint32_t); static void udf_split_extents(struct inode *, int *, int, int, struct kernel_long_ad *, int *); static void udf_prealloc_extents(struct inode *, int, int, struct kernel_long_ad *, int *); static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *); static void udf_update_extents(struct inode *, struct kernel_long_ad *, int, int, struct extent_position *); static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); static void __udf_clear_extent_cache(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->cached_extent.lstart != -1) { brelse(iinfo->cached_extent.epos.bh); iinfo->cached_extent.lstart = -1; } } /* Invalidate extent cache */ static void udf_clear_extent_cache(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); spin_lock(&iinfo->i_extent_cache_lock); __udf_clear_extent_cache(inode); spin_unlock(&iinfo->i_extent_cache_lock); } /* Return contents of extent cache */ static int udf_read_extent_cache(struct inode *inode, loff_t bcount, loff_t *lbcount, struct extent_position *pos) { struct udf_inode_info *iinfo = UDF_I(inode); int ret = 0; spin_lock(&iinfo->i_extent_cache_lock); if ((iinfo->cached_extent.lstart <= bcount) && (iinfo->cached_extent.lstart != -1)) { /* Cache hit */ *lbcount = iinfo->cached_extent.lstart; memcpy(pos, &iinfo->cached_extent.epos, sizeof(struct extent_position)); if (pos->bh) get_bh(pos->bh); ret = 1; } spin_unlock(&iinfo->i_extent_cache_lock); return ret; } /* Add extent to extent cache */ static void udf_update_extent_cache(struct inode *inode, loff_t estart, struct extent_position *pos) { struct udf_inode_info *iinfo = UDF_I(inode); spin_lock(&iinfo->i_extent_cache_lock); /* Invalidate previously cached extent */ __udf_clear_extent_cache(inode); if (pos->bh) get_bh(pos->bh); memcpy(&iinfo->cached_extent.epos, pos, sizeof(*pos)); iinfo->cached_extent.lstart = estart; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: iinfo->cached_extent.epos.offset -= sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: iinfo->cached_extent.epos.offset -= sizeof(struct long_ad); break; } spin_unlock(&iinfo->i_extent_cache_lock); } void udf_evict_inode(struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); int want_delete = 0; if (!is_bad_inode(inode)) { if (!inode->i_nlink) { want_delete = 1; udf_setsize(inode, 0); udf_update_inode(inode, IS_SYNC(inode)); } if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { udf_warn(inode->i_sb, "Inode %lu (mode %o) has inode size %llu different from extent length %llu. Filesystem need not be standards compliant.\n", inode->i_ino, inode->i_mode, (unsigned long long)inode->i_size, (unsigned long long)iinfo->i_lenExtents); } } truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); kfree(iinfo->i_ext.i_data); iinfo->i_ext.i_data = NULL; udf_clear_extent_cache(inode); if (want_delete) { udf_free_inode(inode); } } static void udf_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; struct udf_inode_info *iinfo = UDF_I(inode); loff_t isize = inode->i_size; if (to > isize) { truncate_pagecache(inode, isize); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } } } static int udf_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, udf_get_block, wbc); } static int udf_writepages(struct address_space *mapping, struct writeback_control *wbc) { return mpage_writepages(mapping, wbc, udf_get_block); } static int udf_readpage(struct file *file, struct page *page) { return mpage_readpage(page, udf_get_block); } static int udf_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { return mpage_readpages(mapping, pages, nr_pages, udf_get_block); } static int udf_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int ret; ret = block_write_begin(mapping, pos, len, flags, pagep, udf_get_block); if (unlikely(ret)) udf_write_failed(mapping, pos + len); return ret; } static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; ret = blockdev_direct_IO(iocb, inode, iter, udf_get_block); if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) udf_write_failed(mapping, iocb->ki_pos + count); return ret; } static sector_t udf_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, udf_get_block); } const struct address_space_operations udf_aops = { .readpage = udf_readpage, .readpages = udf_readpages, .writepage = udf_writepage, .writepages = udf_writepages, .write_begin = udf_write_begin, .write_end = generic_write_end, .direct_IO = udf_direct_IO, .bmap = udf_bmap, }; /* * Expand file stored in ICB to a normal one-block-file * * This function requires i_data_sem for writing and releases it. * This function requires i_mutex held */ int udf_expand_file_adinicb(struct inode *inode) { struct page *page; char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); int err; WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; up_write(&iinfo->i_data_sem); mark_inode_dirty(inode); return 0; } /* * Release i_data_sem so that we can lock a page - page lock ranks * above i_data_sem. i_mutex still protects us against file changes. */ up_write(&iinfo->i_data_sem); page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS); if (!page) return -ENOMEM; if (!PageUptodate(page)) { kaddr = kmap_atomic(page); memset(kaddr + iinfo->i_lenAlloc, 0x00, PAGE_SIZE - iinfo->i_lenAlloc); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, iinfo->i_lenAlloc); flush_dcache_page(page); SetPageUptodate(page); kunmap_atomic(kaddr); } down_write(&iinfo->i_data_sem); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; /* from now on we have normal address_space methods */ inode->i_data.a_ops = &udf_aops; set_page_dirty(page); unlock_page(page); up_write(&iinfo->i_data_sem); err = filemap_fdatawrite(inode->i_mapping); if (err) { /* Restore everything back so that we don't lose data... */ lock_page(page); down_write(&iinfo->i_data_sem); kaddr = kmap_atomic(page); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); kunmap_atomic(kaddr); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; iinfo->i_lenAlloc = inode->i_size; up_write(&iinfo->i_data_sem); } put_page(page); mark_inode_dirty(inode); return err; } struct buffer_head *udf_expand_dir_adinicb(struct inode *inode, int *block, int *err) { int newblock; struct buffer_head *dbh = NULL; struct kernel_lb_addr eloc; uint8_t alloctype; struct extent_position epos; struct udf_fileident_bh sfibh, dfibh; loff_t f_pos = udf_ext0_offset(inode); int size = udf_ext0_offset(inode) + inode->i_size; struct fileIdentDesc cfi, *sfi, *dfi; struct udf_inode_info *iinfo = UDF_I(inode); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) alloctype = ICBTAG_FLAG_AD_SHORT; else alloctype = ICBTAG_FLAG_AD_LONG; if (!inode->i_size) { iinfo->i_alloc_type = alloctype; mark_inode_dirty(inode); return NULL; } /* alloc block, and copy data to it */ *block = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, iinfo->i_location.logicalBlockNum, err); if (!(*block)) return NULL; newblock = udf_get_pblock(inode->i_sb, *block, iinfo->i_location.partitionReferenceNum, 0); if (!newblock) return NULL; dbh = udf_tgetblk(inode->i_sb, newblock); if (!dbh) return NULL; lock_buffer(dbh); memset(dbh->b_data, 0x00, inode->i_sb->s_blocksize); set_buffer_uptodate(dbh); unlock_buffer(dbh); mark_buffer_dirty_inode(dbh, inode); sfibh.soffset = sfibh.eoffset = f_pos & (inode->i_sb->s_blocksize - 1); sfibh.sbh = sfibh.ebh = NULL; dfibh.soffset = dfibh.eoffset = 0; dfibh.sbh = dfibh.ebh = dbh; while (f_pos < size) { iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; sfi = udf_fileident_read(inode, &f_pos, &sfibh, &cfi, NULL, NULL, NULL, NULL); if (!sfi) { brelse(dbh); return NULL; } iinfo->i_alloc_type = alloctype; sfi->descTag.tagLocation = cpu_to_le32(*block); dfibh.soffset = dfibh.eoffset; dfibh.eoffset += (sfibh.eoffset - sfibh.soffset); dfi = (struct fileIdentDesc *)(dbh->b_data + dfibh.soffset); if (udf_write_fi(inode, sfi, dfi, &dfibh, sfi->impUse, sfi->fileIdent + le16_to_cpu(sfi->lengthOfImpUse))) { iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; brelse(dbh); return NULL; } } mark_buffer_dirty_inode(dbh, inode); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0, iinfo->i_lenAlloc); iinfo->i_lenAlloc = 0; eloc.logicalBlockNum = *block; eloc.partitionReferenceNum = iinfo->i_location.partitionReferenceNum; iinfo->i_lenExtents = inode->i_size; epos.bh = NULL; epos.block = iinfo->i_location; epos.offset = udf_file_entry_alloc_offset(inode); udf_add_aext(inode, &epos, &eloc, inode->i_size, 0); /* UniqueID stuff */ brelse(epos.bh); mark_inode_dirty(inode); return dbh; } static int udf_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { int err, new; sector_t phys = 0; struct udf_inode_info *iinfo; if (!create) { phys = udf_block_map(inode, block); if (phys) map_bh(bh_result, inode->i_sb, phys); return 0; } err = -EIO; new = 0; iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); if (block == iinfo->i_next_alloc_block + 1) { iinfo->i_next_alloc_block++; iinfo->i_next_alloc_goal++; } /* * Block beyond EOF and prealloc extents? Just discard preallocation * as it is not useful and complicates things. */ if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents) udf_discard_prealloc(inode); udf_clear_extent_cache(inode); phys = inode_getblk(inode, block, &err, &new); if (!phys) goto abort; if (new) set_buffer_new(bh_result); map_bh(bh_result, inode->i_sb, phys); abort: up_write(&iinfo->i_data_sem); return err; } static struct buffer_head *udf_getblk(struct inode *inode, long block, int create, int *err) { struct buffer_head *bh; struct buffer_head dummy; dummy.b_state = 0; dummy.b_blocknr = -1000; *err = udf_get_block(inode, block, &dummy, create); if (!*err && buffer_mapped(&dummy)) { bh = sb_getblk(inode->i_sb, dummy.b_blocknr); if (buffer_new(&dummy)) { lock_buffer(bh); memset(bh->b_data, 0x00, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); } return bh; } return NULL; } /* Extend the file with new blocks totaling 'new_block_bytes', * return the number of extents added */ static int udf_do_extend_file(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, loff_t new_block_bytes) { uint32_t add; int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK); struct super_block *sb = inode->i_sb; struct udf_inode_info *iinfo; int err; /* The previous extent is fake and we should not extend by anything * - there's nothing to do... */ if (!new_block_bytes && fake) return 0; iinfo = UDF_I(inode); /* Round the last extent up to a multiple of block size */ if (last_ext->extLength & (sb->s_blocksize - 1)) { last_ext->extLength = (last_ext->extLength & UDF_EXTENT_FLAG_MASK) | (((last_ext->extLength & UDF_EXTENT_LENGTH_MASK) + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1)); iinfo->i_lenExtents = (iinfo->i_lenExtents + sb->s_blocksize - 1) & ~(sb->s_blocksize - 1); } /* Can we merge with the previous extent? */ if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) == EXT_NOT_RECORDED_NOT_ALLOCATED) { add = (1 << 30) - sb->s_blocksize - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); if (add > new_block_bytes) add = new_block_bytes; new_block_bytes -= add; last_ext->extLength += add; } if (fake) { udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); count++; } else { struct kernel_lb_addr tmploc; uint32_t tmplen; udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); /* * We've rewritten the last extent. If we are going to add * more extents, we may need to enter possible following * empty indirect extent. */ if (new_block_bytes) udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } /* Managed to do everything necessary? */ if (!new_block_bytes) goto out; /* All further extents will be NOT_RECORDED_NOT_ALLOCATED */ last_ext->extLocation.logicalBlockNum = 0; last_ext->extLocation.partitionReferenceNum = 0; add = (1 << 30) - sb->s_blocksize; last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | add; /* Create enough extents to cover the whole hole */ while (new_block_bytes > add) { new_block_bytes -= add; err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) return err; count++; } if (new_block_bytes) { last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | new_block_bytes; err = udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); if (err) return err; count++; } out: /* last_pos should point to the last written extent... */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) last_pos->offset -= sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) last_pos->offset -= sizeof(struct long_ad); else return -EIO; return count; } /* Extend the final block of the file to final_block_len bytes */ static void udf_do_extend_final_block(struct inode *inode, struct extent_position *last_pos, struct kernel_long_ad *last_ext, uint32_t new_elen) { uint32_t added_bytes; /* * Extent already large enough? It may be already rounded up to block * size... */ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK)) return; added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK); last_ext->extLength += added_bytes; UDF_I(inode)->i_lenExtents += added_bytes; udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); } static int udf_extend_file(struct inode *inode, loff_t newsize) { struct extent_position epos; struct kernel_lb_addr eloc; uint32_t elen; int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = newsize >> sb->s_blocksize_bits, offset; loff_t new_elen; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); struct kernel_long_ad extent; int err = 0; bool within_last_ext; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); /* * When creating hole in file, just don't bother with preserving * preallocation. It likely won't be very useful anyway. */ udf_discard_prealloc(inode); etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); within_last_ext = (etype != -1); /* We don't expect extents past EOF... */ WARN_ON_ONCE(within_last_ext && elen > ((loff_t)offset + 1) << inode->i_blkbits); if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) || (epos.bh && epos.offset == sizeof(struct allocExtDesc))) { /* File has no extents at all or has empty last * indirect extent! Create a fake extent... */ extent.extLocation.logicalBlockNum = 0; extent.extLocation.partitionReferenceNum = 0; extent.extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; } else { epos.offset -= adsize; etype = udf_next_aext(inode, &epos, &extent.extLocation, &extent.extLength, 0); extent.extLength |= etype << 30; } new_elen = ((loff_t)offset << inode->i_blkbits) | (newsize & (sb->s_blocksize - 1)); /* File has extent covering the new size (could happen when extending * inside a block)? */ if (within_last_ext) { /* Extending file within the last file block */ udf_do_extend_final_block(inode, &epos, &extent, new_elen); } else { err = udf_do_extend_file(inode, &epos, &extent, new_elen); } if (err < 0) goto out; err = 0; iinfo->i_lenExtents = newsize; out: brelse(epos.bh); return err; } static sector_t inode_getblk(struct inode *inode, sector_t block, int *err, int *new) { struct kernel_long_ad laarr[EXTENT_MERGE_SIZE]; struct extent_position prev_epos, cur_epos, next_epos; int count = 0, startnum = 0, endnum = 0; uint32_t elen = 0, tmpelen; struct kernel_lb_addr eloc, tmpeloc; int c = 1; loff_t lbcount = 0, b_off = 0; uint32_t newblocknum, newblock; sector_t offset = 0; int8_t etype; struct udf_inode_info *iinfo = UDF_I(inode); int goal = 0, pgoal = iinfo->i_location.logicalBlockNum; int lastblock = 0; bool isBeyondEOF; *err = 0; *new = 0; prev_epos.offset = udf_file_entry_alloc_offset(inode); prev_epos.block = iinfo->i_location; prev_epos.bh = NULL; cur_epos = next_epos = prev_epos; b_off = (loff_t)block << inode->i_sb->s_blocksize_bits; /* find the extent which contains the block we are looking for. alternate between laarr[0] and laarr[1] for locations of the current extent, and the previous extent */ do { if (prev_epos.bh != cur_epos.bh) { brelse(prev_epos.bh); get_bh(cur_epos.bh); prev_epos.bh = cur_epos.bh; } if (cur_epos.bh != next_epos.bh) { brelse(cur_epos.bh); get_bh(next_epos.bh); cur_epos.bh = next_epos.bh; } lbcount += elen; prev_epos.block = cur_epos.block; cur_epos.block = next_epos.block; prev_epos.offset = cur_epos.offset; cur_epos.offset = next_epos.offset; etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 1); if (etype == -1) break; c = !c; laarr[c].extLength = (etype << 30) | elen; laarr[c].extLocation = eloc; if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) pgoal = eloc.logicalBlockNum + ((elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); count++; } while (lbcount + elen <= b_off); b_off -= lbcount; offset = b_off >> inode->i_sb->s_blocksize_bits; /* * Move prev_epos and cur_epos into indirect extent if we are at * the pointer to it */ udf_next_aext(inode, &prev_epos, &tmpeloc, &tmpelen, 0); udf_next_aext(inode, &cur_epos, &tmpeloc, &tmpelen, 0); /* if the extent is allocated and recorded, return the block if the extent is not a multiple of the blocksize, round up */ if (etype == (EXT_RECORDED_ALLOCATED >> 30)) { if (elen & (inode->i_sb->s_blocksize - 1)) { elen = EXT_RECORDED_ALLOCATED | ((elen + inode->i_sb->s_blocksize - 1) & ~(inode->i_sb->s_blocksize - 1)); udf_write_aext(inode, &cur_epos, &eloc, elen, 1); } newblock = udf_get_lb_pblock(inode->i_sb, &eloc, offset); goto out_free; } /* Are we beyond EOF and preallocated extent? */ if (etype == -1) { int ret; loff_t hole_len; isBeyondEOF = true; if (count) { if (c) laarr[0] = laarr[1]; startnum = 1; } else { /* Create a fake extent when there's not one */ memset(&laarr[0].extLocation, 0x00, sizeof(struct kernel_lb_addr)); laarr[0].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED; /* Will udf_do_extend_file() create real extent from a fake one? */ startnum = (offset > 0); } /* Create extents for the hole between EOF and offset */ hole_len = (loff_t)offset << inode->i_blkbits; ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len); if (ret < 0) { *err = ret; newblock = 0; goto out_free; } c = 0; offset = 0; count += ret; /* We are not covered by a preallocated extent? */ if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) != EXT_NOT_RECORDED_ALLOCATED) { /* Is there any real extent? - otherwise we overwrite * the fake one... */ if (count) c = !c; laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | inode->i_sb->s_blocksize; memset(&laarr[c].extLocation, 0x00, sizeof(struct kernel_lb_addr)); count++; } endnum = c + 1; lastblock = 1; } else { isBeyondEOF = false; endnum = startnum = ((count > 2) ? 2 : count); /* if the current extent is in position 0, swap it with the previous */ if (!c && count != 1) { laarr[2] = laarr[0]; laarr[0] = laarr[1]; laarr[1] = laarr[2]; c = 1; } /* if the current block is located in an extent, read the next extent */ etype = udf_next_aext(inode, &next_epos, &eloc, &elen, 0); if (etype != -1) { laarr[c + 1].extLength = (etype << 30) | elen; laarr[c + 1].extLocation = eloc; count++; startnum++; endnum++; } else lastblock = 1; } /* if the current extent is not recorded but allocated, get the * block in the extent corresponding to the requested block */ if ((laarr[c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) newblocknum = laarr[c].extLocation.logicalBlockNum + offset; else { /* otherwise, allocate a new block */ if (iinfo->i_next_alloc_block == block) goal = iinfo->i_next_alloc_goal; if (!goal) { if (!(goal = pgoal)) /* XXX: what was intended here? */ goal = iinfo->i_location.logicalBlockNum + 1; } newblocknum = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, goal, err); if (!newblocknum) { *err = -ENOSPC; newblock = 0; goto out_free; } if (isBeyondEOF) iinfo->i_lenExtents += inode->i_sb->s_blocksize; } /* if the extent the requsted block is located in contains multiple * blocks, split the extent into at most three extents. blocks prior * to requested block, requested block, and blocks after requested * block */ udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); /* We preallocate blocks only for regular files. It also makes sense * for directories but there's a problem when to drop the * preallocation. We might use some delayed work for that but I feel * it's overengineering for a filesystem like UDF. */ if (S_ISREG(inode->i_mode)) udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); /* merge any continuous blocks in laarr */ udf_merge_extents(inode, laarr, &endnum); /* write back the new extents, inserting new extents if the new number * of extents is greater than the old number, and deleting extents if * the new number of extents is less than the old number */ udf_update_extents(inode, laarr, startnum, endnum, &prev_epos); newblock = udf_get_pblock(inode->i_sb, newblocknum, iinfo->i_location.partitionReferenceNum, 0); if (!newblock) { *err = -EIO; goto out_free; } *new = 1; iinfo->i_next_alloc_block = block; iinfo->i_next_alloc_goal = newblocknum; inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); out_free: brelse(prev_epos.bh); brelse(cur_epos.bh); brelse(next_epos.bh); return newblock; } static void udf_split_extents(struct inode *inode, int *c, int offset, int newblocknum, struct kernel_long_ad *laarr, int *endnum) { unsigned long blocksize = inode->i_sb->s_blocksize; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; if ((laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30) || (laarr[*c].extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { int curr = *c; int blen = ((laarr[curr].extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits; int8_t etype = (laarr[curr].extLength >> 30); if (blen == 1) ; else if (!offset || blen == offset + 1) { laarr[curr + 2] = laarr[curr + 1]; laarr[curr + 1] = laarr[curr]; } else { laarr[curr + 3] = laarr[curr + 1]; laarr[curr + 2] = laarr[curr + 1] = laarr[curr]; } if (offset) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, &laarr[curr].extLocation, 0, offset); laarr[curr].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED | (offset << blocksize_bits); laarr[curr].extLocation.logicalBlockNum = 0; laarr[curr].extLocation. partitionReferenceNum = 0; } else laarr[curr].extLength = (etype << 30) | (offset << blocksize_bits); curr++; (*c)++; (*endnum)++; } laarr[curr].extLocation.logicalBlockNum = newblocknum; if (etype == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) laarr[curr].extLocation.partitionReferenceNum = UDF_I(inode)->i_location.partitionReferenceNum; laarr[curr].extLength = EXT_RECORDED_ALLOCATED | blocksize; curr++; if (blen != offset + 1) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) laarr[curr].extLocation.logicalBlockNum += offset + 1; laarr[curr].extLength = (etype << 30) | ((blen - (offset + 1)) << blocksize_bits); curr++; (*endnum)++; } } } static void udf_prealloc_extents(struct inode *inode, int c, int lastblock, struct kernel_long_ad *laarr, int *endnum) { int start, length = 0, currlength = 0, i; if (*endnum >= (c + 1)) { if (!lastblock) return; else start = c; } else { if ((laarr[c + 1].extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { start = c + 1; length = currlength = (((laarr[c + 1].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); } else start = c; } for (i = start + 1; i <= *endnum; i++) { if (i == *endnum) { if (lastblock) length += UDF_DEFAULT_PREALLOC_BLOCKS; } else if ((laarr[i].extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) { length += (((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); } else break; } if (length) { int next = laarr[start].extLocation.logicalBlockNum + (((laarr[start].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits); int numalloc = udf_prealloc_blocks(inode->i_sb, inode, laarr[start].extLocation.partitionReferenceNum, next, (UDF_DEFAULT_PREALLOC_BLOCKS > length ? length : UDF_DEFAULT_PREALLOC_BLOCKS) - currlength); if (numalloc) { if (start == (c + 1)) laarr[start].extLength += (numalloc << inode->i_sb->s_blocksize_bits); else { memmove(&laarr[c + 2], &laarr[c + 1], sizeof(struct long_ad) * (*endnum - (c + 1))); (*endnum)++; laarr[c + 1].extLocation.logicalBlockNum = next; laarr[c + 1].extLocation.partitionReferenceNum = laarr[c].extLocation. partitionReferenceNum; laarr[c + 1].extLength = EXT_NOT_RECORDED_ALLOCATED | (numalloc << inode->i_sb->s_blocksize_bits); start = c + 1; } for (i = start + 1; numalloc && i < *endnum; i++) { int elen = ((laarr[i].extLength & UDF_EXTENT_LENGTH_MASK) + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (elen > numalloc) { laarr[i].extLength -= (numalloc << inode->i_sb->s_blocksize_bits); numalloc = 0; } else { numalloc -= elen; if (*endnum > (i + 1)) memmove(&laarr[i], &laarr[i + 1], sizeof(struct long_ad) * (*endnum - (i + 1))); i--; (*endnum)--; } } UDF_I(inode)->i_lenExtents += numalloc << inode->i_sb->s_blocksize_bits; } } } static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr, int *endnum) { int i; unsigned long blocksize = inode->i_sb->s_blocksize; unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; for (i = 0; i < (*endnum - 1); i++) { struct kernel_long_ad *li /*l[i]*/ = &laarr[i]; struct kernel_long_ad *lip1 /*l[i plus 1]*/ = &laarr[i + 1]; if (((li->extLength >> 30) == (lip1->extLength >> 30)) && (((li->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) || ((lip1->extLocation.logicalBlockNum - li->extLocation.logicalBlockNum) == (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits)))) { if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { lip1->extLength = (lip1->extLength - (li->extLength & UDF_EXTENT_LENGTH_MASK) + UDF_EXTENT_LENGTH_MASK) & ~(blocksize - 1); li->extLength = (li->extLength & UDF_EXTENT_FLAG_MASK) + (UDF_EXTENT_LENGTH_MASK + 1) - blocksize; lip1->extLocation.logicalBlockNum = li->extLocation.logicalBlockNum + ((li->extLength & UDF_EXTENT_LENGTH_MASK) >> blocksize_bits); } else { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; } } else if (((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) && ((lip1->extLength >> 30) == (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30))) { udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); li->extLocation.logicalBlockNum = 0; li->extLocation.partitionReferenceNum = 0; if (((li->extLength & UDF_EXTENT_LENGTH_MASK) + (lip1->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) { lip1->extLength = (lip1->extLength - (li->extLength & UDF_EXTENT_LENGTH_MASK) + UDF_EXTENT_LENGTH_MASK) & ~(blocksize - 1); li->extLength = (li->extLength & UDF_EXTENT_FLAG_MASK) + (UDF_EXTENT_LENGTH_MASK + 1) - blocksize; } else { li->extLength = lip1->extLength + (((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) & ~(blocksize - 1)); if (*endnum > (i + 2)) memmove(&laarr[i + 1], &laarr[i + 2], sizeof(struct long_ad) * (*endnum - (i + 2))); i--; (*endnum)--; } } else if ((li->extLength >> 30) == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, &li->extLocation, 0, ((li->extLength & UDF_EXTENT_LENGTH_MASK) + blocksize - 1) >> blocksize_bits); li->extLocation.logicalBlockNum = 0; li->extLocation.partitionReferenceNum = 0; li->extLength = (li->extLength & UDF_EXTENT_LENGTH_MASK) | EXT_NOT_RECORDED_NOT_ALLOCATED; } } } static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr, int startnum, int endnum, struct extent_position *epos) { int start = 0, i; struct kernel_lb_addr tmploc; uint32_t tmplen; if (startnum > endnum) { for (i = 0; i < (startnum - endnum); i++) udf_delete_aext(inode, *epos); } else if (startnum < endnum) { for (i = 0; i < (endnum - startnum); i++) { udf_insert_aext(inode, *epos, laarr[i].extLocation, laarr[i].extLength); udf_next_aext(inode, epos, &laarr[i].extLocation, &laarr[i].extLength, 1); start++; } } for (i = start; i < endnum; i++) { udf_next_aext(inode, epos, &tmploc, &tmplen, 0); udf_write_aext(inode, epos, &laarr[i].extLocation, laarr[i].extLength, 1); } } struct buffer_head *udf_bread(struct inode *inode, int block, int create, int *err) { struct buffer_head *bh = NULL; bh = udf_getblk(inode, block, create, err); if (!bh) return NULL; if (buffer_uptodate(bh)) return bh; ll_rw_block(REQ_OP_READ, 0, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; brelse(bh); *err = -EIO; return NULL; } int udf_setsize(struct inode *inode, loff_t newsize) { int err; struct udf_inode_info *iinfo; int bsize = i_blocksize(inode); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return -EINVAL; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; iinfo = UDF_I(inode); if (newsize > inode->i_size) { down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { if (bsize < (udf_file_entry_alloc_offset(inode) + newsize)) { err = udf_expand_file_adinicb(inode); if (err) return err; down_write(&iinfo->i_data_sem); } else { iinfo->i_lenAlloc = newsize; goto set_size; } } err = udf_extend_file(inode, newsize); if (err) { up_write(&iinfo->i_data_sem); return err; } set_size: up_write(&iinfo->i_data_sem); truncate_setsize(inode, newsize); } else { if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr + newsize, 0x00, bsize - newsize - udf_file_entry_alloc_offset(inode)); iinfo->i_lenAlloc = newsize; truncate_setsize(inode, newsize); up_write(&iinfo->i_data_sem); goto update_time; } err = block_truncate_page(inode->i_mapping, newsize, udf_get_block); if (err) return err; truncate_setsize(inode, newsize); down_write(&iinfo->i_data_sem); udf_clear_extent_cache(inode); udf_truncate_extents(inode); up_write(&iinfo->i_data_sem); } update_time: inode->i_mtime = inode->i_ctime = current_time(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else mark_inode_dirty(inode); return 0; } /* * Maximum length of linked list formed by ICB hierarchy. The chosen number is * arbitrary - just that we hopefully don't limit any real use of rewritten * inode on write-once media but avoid looping for too long on corrupted media. */ #define UDF_MAX_ICB_NESTING 1024 static int udf_read_inode(struct inode *inode, bool hidden_inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); struct kernel_lb_addr *iloc = &iinfo->i_location; unsigned int link_count; unsigned int indirections = 0; int bs = inode->i_sb->s_blocksize; int ret = -EIO; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { udf_debug("partition reference: %d > logical volume partitions: %d\n", iloc->partitionReferenceNum, sbi->s_partitions); return -EIO; } if (iloc->logicalBlockNum >= sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { udf_debug("block=%d, partition=%d out of range\n", iloc->logicalBlockNum, iloc->partitionReferenceNum); return -EIO; } /* * Set defaults, but the inode is still incomplete! * Note: get_new_inode() sets the following on a new inode: * i_sb = sb * i_no = ino * i_flags = sb->s_flags * i_state = 0 * clean_inode(): zero fills and sets * i_count = 1 * i_nlink = 1 * i_op = NULL; */ bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", inode->i_ino, ident); goto out; } fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); if (ie->indirectICB.extLength) { brelse(ibh); memcpy(&iinfo->i_location, &loc, sizeof(struct kernel_lb_addr)); if (++indirections > UDF_MAX_ICB_NESTING) { udf_err(inode->i_sb, "too many ICBs in ICB hierarchy" " (max %d supported)\n", UDF_MAX_ICB_NESTING); goto out; } brelse(bh); goto reread; } } brelse(ibh); } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { udf_err(inode->i_sb, "unsupported strategy type: %d\n", le16_to_cpu(fe->icbTag.strategyType)); goto out; } if (fe->icbTag.strategyType == cpu_to_le16(4)) iinfo->i_strat4096 = 0; else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ iinfo->i_strat4096 = 1; iinfo->i_alloc_type = le16_to_cpu(fe->icbTag.flags) & ICBTAG_FLAG_AD_MASK; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_SHORT && iinfo->i_alloc_type != ICBTAG_FLAG_AD_LONG && iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { ret = -EIO; goto out; } iinfo->i_unique = 0; iinfo->i_lenEAttr = 0; iinfo->i_lenExtents = 0; iinfo->i_lenAlloc = 0; iinfo->i_next_alloc_block = 0; iinfo->i_next_alloc_goal = 0; if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { iinfo->i_efe = 1; iinfo->i_use = 0; ret = udf_alloc_i_data(inode, bs - sizeof(struct extendedFileEntry)); if (ret) goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct extendedFileEntry), bs - sizeof(struct extendedFileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { iinfo->i_efe = 0; iinfo->i_use = 0; ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry)); if (ret) goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct fileEntry), bs - sizeof(struct fileEntry)); } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { iinfo->i_efe = 0; iinfo->i_use = 1; iinfo->i_lenAlloc = le32_to_cpu( ((struct unallocSpaceEntry *)bh->b_data)-> lengthAllocDescs); ret = udf_alloc_i_data(inode, bs - sizeof(struct unallocSpaceEntry)); if (ret) goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct unallocSpaceEntry), bs - sizeof(struct unallocSpaceEntry)); return 0; } ret = -EIO; read_lock(&sbi->s_cred_lock); i_uid_write(inode, le32_to_cpu(fe->uid)); if (!uid_valid(inode->i_uid) || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_IGNORE) || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_SET)) inode->i_uid = UDF_SB(inode->i_sb)->s_uid; i_gid_write(inode, le32_to_cpu(fe->gid)); if (!gid_valid(inode->i_gid) || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_IGNORE) || UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_SET)) inode->i_gid = UDF_SB(inode->i_sb)->s_gid; if (fe->icbTag.fileType != ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_fmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_fmode; else if (fe->icbTag.fileType == ICBTAG_FILE_TYPE_DIRECTORY && sbi->s_dmode != UDF_INVALID_MODE) inode->i_mode = sbi->s_dmode; else inode->i_mode = udf_convert_permissions(fe); inode->i_mode &= ~sbi->s_umask; read_unlock(&sbi->s_cred_lock); link_count = le16_to_cpu(fe->fileLinkCount); if (!link_count) { if (!hidden_inode) { ret = -ESTALE; goto out; } link_count = 1; } set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); iinfo->i_lenExtents = inode->i_size; if (iinfo->i_efe == 0) { inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime)) inode->i_atime = sbi->s_record_time; if (!udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime)) inode->i_mtime = sbi->s_record_time; if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime)) inode->i_ctime = sbi->s_record_time; iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(fe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(fe->checkpoint); } else { inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime)) inode->i_atime = sbi->s_record_time; if (!udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime)) inode->i_mtime = sbi->s_record_time; if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime)) iinfo->i_crtime = sbi->s_record_time; if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime)) inode->i_ctime = sbi->s_record_time; iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); } inode->i_generation = iinfo->i_unique; /* * Sanity check length of allocation descriptors and extended attrs to * avoid integer overflows */ if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs) goto out; /* Now do exact checks */ if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs) goto out; /* Sanity checks for files in ICB so that we don't get confused later */ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { /* * For file in ICB data is stored in allocation descriptor * so sizes should match */ if (iinfo->i_lenAlloc != inode->i_size) goto out; /* File in ICB has to fit in there... */ if (inode->i_size > bs - udf_file_entry_alloc_offset(inode)) goto out; } switch (fe->icbTag.fileType) { case ICBTAG_FILE_TYPE_DIRECTORY: inode->i_op = &udf_dir_inode_operations; inode->i_fop = &udf_dir_operations; inode->i_mode |= S_IFDIR; inc_nlink(inode); break; case ICBTAG_FILE_TYPE_REALTIME: case ICBTAG_FILE_TYPE_REGULAR: case ICBTAG_FILE_TYPE_UNDEF: case ICBTAG_FILE_TYPE_VAT20: if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) inode->i_data.a_ops = &udf_adinicb_aops; else inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; inode->i_mode |= S_IFREG; break; case ICBTAG_FILE_TYPE_BLOCK: inode->i_mode |= S_IFBLK; break; case ICBTAG_FILE_TYPE_CHAR: inode->i_mode |= S_IFCHR; break; case ICBTAG_FILE_TYPE_FIFO: init_special_inode(inode, inode->i_mode | S_IFIFO, 0); break; case ICBTAG_FILE_TYPE_SOCKET: init_special_inode(inode, inode->i_mode | S_IFSOCK, 0); break; case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); inode->i_mode = S_IFLNK | 0777; break; case ICBTAG_FILE_TYPE_MAIN: udf_debug("METADATA FILE-----\n"); break; case ICBTAG_FILE_TYPE_MIRROR: udf_debug("METADATA MIRROR FILE-----\n"); break; case ICBTAG_FILE_TYPE_BITMAP: udf_debug("METADATA BITMAP FILE-----\n"); break; default: udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", inode->i_ino, fe->icbTag.fileType); goto out; } if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (dsea) { init_special_inode(inode, inode->i_mode, MKDEV(le32_to_cpu(dsea->majorDeviceIdent), le32_to_cpu(dsea->minorDeviceIdent))); /* Developer ID ??? */ } else goto out; } ret = 0; out: brelse(bh); return ret; } static int udf_alloc_i_data(struct inode *inode, size_t size) { struct udf_inode_info *iinfo = UDF_I(inode); iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); if (!iinfo->i_ext.i_data) return -ENOMEM; return 0; } static umode_t udf_convert_permissions(struct fileEntry *fe) { umode_t mode; uint32_t permissions; uint32_t flags; permissions = le32_to_cpu(fe->permissions); flags = le16_to_cpu(fe->icbTag.flags); mode = ((permissions) & 0007) | ((permissions >> 2) & 0070) | ((permissions >> 4) & 0700) | ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); return mode; } int udf_write_inode(struct inode *inode, struct writeback_control *wbc) { return udf_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } static int udf_sync_inode(struct inode *inode) { return udf_update_inode(inode, 1); } static void udf_adjust_time(struct udf_inode_info *iinfo, struct timespec time) { if (iinfo->i_crtime.tv_sec > time.tv_sec || (iinfo->i_crtime.tv_sec == time.tv_sec && iinfo->i_crtime.tv_nsec > time.tv_nsec)) iinfo->i_crtime = time; } static int udf_update_inode(struct inode *inode, int do_sync) { struct buffer_head *bh = NULL; struct fileEntry *fe; struct extendedFileEntry *efe; uint64_t lb_recorded; uint32_t udfperms; uint16_t icbflags; uint16_t crclen; int err = 0; struct udf_sb_info *sbi = UDF_SB(inode->i_sb); unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; struct udf_inode_info *iinfo = UDF_I(inode); bh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); if (!bh) { udf_debug("getblk failure\n"); return -EIO; } lock_buffer(bh); memset(bh->b_data, 0, inode->i_sb->s_blocksize); fe = (struct fileEntry *)bh->b_data; efe = (struct extendedFileEntry *)bh->b_data; if (iinfo->i_use) { struct unallocSpaceEntry *use = (struct unallocSpaceEntry *)bh->b_data; use->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); memcpy(bh->b_data + sizeof(struct unallocSpaceEntry), iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); crclen = sizeof(struct unallocSpaceEntry); goto finish; } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) fe->uid = cpu_to_le32(-1); else fe->uid = cpu_to_le32(i_uid_read(inode)); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_GID_FORGET)) fe->gid = cpu_to_le32(-1); else fe->gid = cpu_to_le32(i_gid_read(inode)); udfperms = ((inode->i_mode & 0007)) | ((inode->i_mode & 0070) << 2) | ((inode->i_mode & 0700) << 4); udfperms |= (le32_to_cpu(fe->permissions) & (FE_PERM_O_DELETE | FE_PERM_O_CHATTR | FE_PERM_G_DELETE | FE_PERM_G_CHATTR | FE_PERM_U_DELETE | FE_PERM_U_CHATTR)); fe->permissions = cpu_to_le32(udfperms); if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); else fe->fileLinkCount = cpu_to_le16(inode->i_nlink); fe->informationLength = cpu_to_le64(inode->i_size); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct regid *eid; struct deviceSpec *dsea = (struct deviceSpec *)udf_get_extendedattr(inode, 12, 1); if (!dsea) { dsea = (struct deviceSpec *) udf_add_extendedattr(inode, sizeof(struct deviceSpec) + sizeof(struct regid), 12, 0x3); dsea->attrType = cpu_to_le32(12); dsea->attrSubtype = 1; dsea->attrLength = cpu_to_le32( sizeof(struct deviceSpec) + sizeof(struct regid)); dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); } eid = (struct regid *)dsea->impUse; memset(eid, 0, sizeof(*eid)); strcpy(eid->ident, UDF_ID_DEVELOPER); eid->identSuffix[0] = UDF_OS_CLASS_UNIX; eid->identSuffix[1] = UDF_OS_ID_LINUX; dsea->majorDeviceIdent = cpu_to_le32(imajor(inode)); dsea->minorDeviceIdent = cpu_to_le32(iminor(inode)); } if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) lb_recorded = 0; /* No extents => no blocks! */ else lb_recorded = (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> (blocksize_bits - 9); if (iinfo->i_efe == 0) { memcpy(bh->b_data + sizeof(struct fileEntry), iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; fe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; fe->uniqueID = cpu_to_le64(iinfo->i_unique); fe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); fe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); fe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); fe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_FE); crclen = sizeof(struct fileEntry); } else { memcpy(bh->b_data + sizeof(struct extendedFileEntry), iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct extendedFileEntry)); efe->objectSize = cpu_to_le64(inode->i_size); efe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); udf_adjust_time(iinfo, inode->i_atime); udf_adjust_time(iinfo, inode->i_mtime); udf_adjust_time(iinfo, inode->i_ctime); udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; efe->uniqueID = cpu_to_le64(iinfo->i_unique); efe->lengthExtendedAttr = cpu_to_le32(iinfo->i_lenEAttr); efe->lengthAllocDescs = cpu_to_le32(iinfo->i_lenAlloc); efe->checkpoint = cpu_to_le32(iinfo->i_checkpoint); efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } finish: if (iinfo->i_strat4096) { fe->icbTag.strategyType = cpu_to_le16(4096); fe->icbTag.strategyParameter = cpu_to_le16(1); fe->icbTag.numEntries = cpu_to_le16(2); } else { fe->icbTag.strategyType = cpu_to_le16(4); fe->icbTag.numEntries = cpu_to_le16(1); } if (iinfo->i_use) fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE; else if (S_ISDIR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; else if (S_ISREG(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; else if (S_ISLNK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_SYMLINK; else if (S_ISBLK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_BLOCK; else if (S_ISCHR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_CHAR; else if (S_ISFIFO(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_FIFO; else if (S_ISSOCK(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_SOCKET; icbflags = iinfo->i_alloc_type | ((inode->i_mode & S_ISUID) ? ICBTAG_FLAG_SETUID : 0) | ((inode->i_mode & S_ISGID) ? ICBTAG_FLAG_SETGID : 0) | ((inode->i_mode & S_ISVTX) ? ICBTAG_FLAG_STICKY : 0) | (le16_to_cpu(fe->icbTag.flags) & ~(ICBTAG_FLAG_AD_MASK | ICBTAG_FLAG_SETUID | ICBTAG_FLAG_SETGID | ICBTAG_FLAG_STICKY)); fe->icbTag.flags = cpu_to_le16(icbflags); if (sbi->s_udfrev >= 0x0200) fe->descTag.descVersion = cpu_to_le16(3); else fe->descTag.descVersion = cpu_to_le16(2); fe->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); fe->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - sizeof(struct tag); fe->descTag.descCRCLength = cpu_to_le16(crclen); fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(struct tag), crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); set_buffer_uptodate(bh); unlock_buffer(bh); /* write the data blocks */ mark_buffer_dirty(bh); if (do_sync) { sync_dirty_buffer(bh); if (buffer_write_io_error(bh)) { udf_warn(inode->i_sb, "IO error syncing udf inode [%08lx]\n", inode->i_ino); err = -EIO; } } brelse(bh); return err; } struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, bool hidden_inode) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); int err; if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); err = udf_read_inode(inode, hidden_inode); if (err < 0) { iget_failed(inode); return ERR_PTR(err); } unlock_new_inode(inode); return inode; } int udf_setup_indirect_aext(struct inode *inode, int block, struct extent_position *epos) { struct super_block *sb = inode->i_sb; struct buffer_head *bh; struct allocExtDesc *aed; struct extent_position nepos; struct kernel_lb_addr neloc; int ver, adsize; if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; neloc.logicalBlockNum = block; neloc.partitionReferenceNum = epos->block.partitionReferenceNum; bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); if (!bh) return -EIO; lock_buffer(bh); memset(bh->b_data, 0x00, sb->s_blocksize); set_buffer_uptodate(bh); unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); aed = (struct allocExtDesc *)(bh->b_data); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) { aed->previousAllocExtLocation = cpu_to_le32(epos->block.logicalBlockNum); } aed->lengthAllocDescs = cpu_to_le32(0); if (UDF_SB(sb)->s_udfrev >= 0x0200) ver = 3; else ver = 2; udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block, sizeof(struct tag)); nepos.block = neloc; nepos.offset = sizeof(struct allocExtDesc); nepos.bh = bh; /* * Do we have to copy current last extent to make space for indirect * one? */ if (epos->offset + adsize > sb->s_blocksize) { struct kernel_lb_addr cp_loc; uint32_t cp_len; int cp_type; epos->offset -= adsize; cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0); cp_len |= ((uint32_t)cp_type) << 30; __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); udf_write_aext(inode, epos, &nepos.block, sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); } else { __udf_add_aext(inode, epos, &nepos.block, sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); } brelse(epos->bh); *epos = nepos; return 0; } /* * Append extent at the given position - should be the first free one in inode * / indirect extent. This function assumes there is enough space in the inode * or indirect extent. Use udf_add_aext() if you didn't check for this before. */ int __udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { struct udf_inode_info *iinfo = UDF_I(inode); struct allocExtDesc *aed; int adsize; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; if (!epos->bh) { WARN_ON(iinfo->i_lenAlloc != epos->offset - udf_file_entry_alloc_offset(inode)); } else { aed = (struct allocExtDesc *)epos->bh->b_data; WARN_ON(le32_to_cpu(aed->lengthAllocDescs) != epos->offset - sizeof(struct allocExtDesc)); WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize); } udf_write_aext(inode, epos, eloc, elen, inc); if (!epos->bh) { iinfo->i_lenAlloc += adsize; mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)epos->bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(epos->bh->b_data, epos->offset + (inc ? 0 : adsize)); else udf_update_tag(epos->bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(epos->bh, inode); } return 0; } /* * Append extent at given position - should be the first free one in inode * / indirect extent. Takes care of allocating and linking indirect blocks. */ int udf_add_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; struct super_block *sb = inode->i_sb; if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else return -EIO; if (epos->offset + (2 * adsize) > sb->s_blocksize) { int err; int new_block; new_block = udf_new_block(sb, NULL, epos->block.partitionReferenceNum, epos->block.logicalBlockNum, &err); if (!new_block) return -ENOSPC; err = udf_setup_indirect_aext(inode, new_block, epos); if (err) return err; } return __udf_add_aext(inode, epos, eloc, elen, inc); } void udf_write_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { int adsize; uint8_t *ptr; struct short_ad *sad; struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) ptr = iinfo->i_ext.i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; else ptr = epos->bh->b_data + epos->offset; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: sad = (struct short_ad *)ptr; sad->extLength = cpu_to_le32(elen); sad->extPosition = cpu_to_le32(eloc->logicalBlockNum); adsize = sizeof(struct short_ad); break; case ICBTAG_FLAG_AD_LONG: lad = (struct long_ad *)ptr; lad->extLength = cpu_to_le32(elen); lad->extLocation = cpu_to_lelb(*eloc); memset(lad->impUse, 0x00, sizeof(lad->impUse)); adsize = sizeof(struct long_ad); break; default: return; } if (epos->bh) { if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) { struct allocExtDesc *aed = (struct allocExtDesc *)epos->bh->b_data; udf_update_tag(epos->bh->b_data, le32_to_cpu(aed->lengthAllocDescs) + sizeof(struct allocExtDesc)); } mark_buffer_dirty_inode(epos->bh, inode); } else { mark_inode_dirty(inode); } if (inc) epos->offset += adsize; } /* * Only 1 indirect extent in a row really makes sense but allow upto 16 in case * someone does some weird stuff. */ #define UDF_MAX_INDIR_EXTS 16 int8_t udf_next_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int inc) { int8_t etype; unsigned int indirections = 0; while ((etype = udf_current_aext(inode, epos, eloc, elen, inc)) == (EXT_NEXT_EXTENT_ALLOCDECS >> 30)) { int block; if (++indirections > UDF_MAX_INDIR_EXTS) { udf_err(inode->i_sb, "too many indirect extents in inode %lu\n", inode->i_ino); return -1; } epos->block = *eloc; epos->offset = sizeof(struct allocExtDesc); brelse(epos->bh); block = udf_get_lb_pblock(inode->i_sb, &epos->block, 0); epos->bh = udf_tread(inode->i_sb, block); if (!epos->bh) { udf_debug("reading block %d failed!\n", block); return -1; } } return etype; } int8_t udf_current_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t *elen, int inc) { int alen; int8_t etype; uint8_t *ptr; struct short_ad *sad; struct long_ad *lad; struct udf_inode_info *iinfo = UDF_I(inode); if (!epos->bh) { if (!epos->offset) epos->offset = udf_file_entry_alloc_offset(inode); ptr = iinfo->i_ext.i_data + epos->offset - udf_file_entry_alloc_offset(inode) + iinfo->i_lenEAttr; alen = udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc; } else { if (!epos->offset) epos->offset = sizeof(struct allocExtDesc); ptr = epos->bh->b_data + epos->offset; alen = sizeof(struct allocExtDesc) + le32_to_cpu(((struct allocExtDesc *)epos->bh->b_data)-> lengthAllocDescs); } switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: sad = udf_get_fileshortad(ptr, alen, &epos->offset, inc); if (!sad) return -1; etype = le32_to_cpu(sad->extLength) >> 30; eloc->logicalBlockNum = le32_to_cpu(sad->extPosition); eloc->partitionReferenceNum = iinfo->i_location.partitionReferenceNum; *elen = le32_to_cpu(sad->extLength) & UDF_EXTENT_LENGTH_MASK; break; case ICBTAG_FLAG_AD_LONG: lad = udf_get_filelongad(ptr, alen, &epos->offset, inc); if (!lad) return -1; etype = le32_to_cpu(lad->extLength) >> 30; *eloc = lelb_to_cpu(lad->extLocation); *elen = le32_to_cpu(lad->extLength) & UDF_EXTENT_LENGTH_MASK; break; default: udf_debug("alloc_type = %d unsupported\n", iinfo->i_alloc_type); return -1; } return etype; } static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos, struct kernel_lb_addr neloc, uint32_t nelen) { struct kernel_lb_addr oeloc; uint32_t oelen; int8_t etype; if (epos.bh) get_bh(epos.bh); while ((etype = udf_next_aext(inode, &epos, &oeloc, &oelen, 0)) != -1) { udf_write_aext(inode, &epos, &neloc, nelen, 1); neloc = oeloc; nelen = (etype << 30) | oelen; } udf_add_aext(inode, &epos, &neloc, nelen, 1); brelse(epos.bh); return (nelen >> 30); } int8_t udf_delete_aext(struct inode *inode, struct extent_position epos) { struct extent_position oepos; int adsize; int8_t etype; struct allocExtDesc *aed; struct udf_inode_info *iinfo; struct kernel_lb_addr eloc; uint32_t elen; if (epos.bh) { get_bh(epos.bh); get_bh(epos.bh); } iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else adsize = 0; oepos = epos; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) == -1) return -1; while ((etype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { udf_write_aext(inode, &oepos, &eloc, (etype << 30) | elen, 1); if (oepos.bh != epos.bh) { oepos.block = epos.block; brelse(oepos.bh); get_bh(epos.bh); oepos.bh = epos.bh; oepos.offset = epos.offset - adsize; } } memset(&eloc, 0x00, sizeof(struct kernel_lb_addr)); elen = 0; if (epos.bh != oepos.bh) { udf_free_blocks(inode->i_sb, inode, &epos.block, 0, 1); udf_write_aext(inode, &oepos, &eloc, elen, 1); udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= (adsize * 2); mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize)); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, oepos.offset - (2 * adsize)); else udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(oepos.bh, inode); } } else { udf_write_aext(inode, &oepos, &eloc, elen, 1); if (!oepos.bh) { iinfo->i_lenAlloc -= adsize; mark_inode_dirty(inode); } else { aed = (struct allocExtDesc *)oepos.bh->b_data; le32_add_cpu(&aed->lengthAllocDescs, -adsize); if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) udf_update_tag(oepos.bh->b_data, epos.offset - adsize); else udf_update_tag(oepos.bh->b_data, sizeof(struct allocExtDesc)); mark_buffer_dirty_inode(oepos.bh, inode); } } brelse(epos.bh); brelse(oepos.bh); return (elen >> 30); } int8_t inode_bmap(struct inode *inode, sector_t block, struct extent_position *pos, struct kernel_lb_addr *eloc, uint32_t *elen, sector_t *offset) { unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; loff_t lbcount = 0, bcount = (loff_t) block << blocksize_bits; int8_t etype; struct udf_inode_info *iinfo; iinfo = UDF_I(inode); if (!udf_read_extent_cache(inode, bcount, &lbcount, pos)) { pos->offset = 0; pos->block = iinfo->i_location; pos->bh = NULL; } *elen = 0; do { etype = udf_next_aext(inode, pos, eloc, elen, 1); if (etype == -1) { *offset = (bcount - lbcount) >> blocksize_bits; iinfo->i_lenExtents = lbcount; return -1; } lbcount += *elen; } while (lbcount <= bcount); /* update extent cache */ udf_update_extent_cache(inode, lbcount - *elen, pos); *offset = (bcount + *elen - lbcount) >> blocksize_bits; return etype; } long udf_block_map(struct inode *inode, sector_t block) { struct kernel_lb_addr eloc; uint32_t elen; sector_t offset; struct extent_position epos = {}; int ret; down_read(&UDF_I(inode)->i_data_sem); if (inode_bmap(inode, block, &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) ret = udf_get_lb_pblock(inode->i_sb, &eloc, offset); else ret = 0; up_read(&UDF_I(inode)->i_data_sem); brelse(epos.bh); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_VARCONV)) return udf_fixed_to_variable(ret); else return ret; }
47 589 582 819 865 3 142 9 51 569 137 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SIGNAL_H #define _LINUX_SIGNAL_H #include <linux/bug.h> #include <linux/signal_types.h> #include <linux/string.h> struct task_struct; /* for sysctl */ extern int print_fatal_signals; static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) { if (from->si_code < 0) memcpy(to, from, sizeof(*to)); else /* _sigchld is currently the largest know union member */ memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld)); } int copy_siginfo_to_user(struct siginfo __user *to, const struct siginfo *from); enum siginfo_layout { SIL_KILL, SIL_TIMER, SIL_POLL, SIL_FAULT, SIL_CHLD, SIL_RT, #ifdef __ARCH_SIGSYS SIL_SYS, #endif }; enum siginfo_layout siginfo_layout(unsigned sig, int si_code); /* * Define some primitives to manipulate sigset_t. */ #ifndef __HAVE_ARCH_SIG_BITOPS #include <linux/bitops.h> /* We don't use <linux/bitops.h> for these because there is no need to be atomic. */ static inline void sigaddset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] |= 1UL << sig; else set->sig[sig / _NSIG_BPW] |= 1UL << (sig % _NSIG_BPW); } static inline void sigdelset(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) set->sig[0] &= ~(1UL << sig); else set->sig[sig / _NSIG_BPW] &= ~(1UL << (sig % _NSIG_BPW)); } static inline int sigismember(sigset_t *set, int _sig) { unsigned long sig = _sig - 1; if (_NSIG_WORDS == 1) return 1 & (set->sig[0] >> sig); else return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW)); } #endif /* __HAVE_ARCH_SIG_BITOPS */ static inline int sigisemptyset(sigset_t *set) { switch (_NSIG_WORDS) { case 4: return (set->sig[3] | set->sig[2] | set->sig[1] | set->sig[0]) == 0; case 2: return (set->sig[1] | set->sig[0]) == 0; case 1: return set->sig[0] == 0; default: BUILD_BUG(); return 0; } } static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) { switch (_NSIG_WORDS) { case 4: return (set1->sig[3] == set2->sig[3]) && (set1->sig[2] == set2->sig[2]) && (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 2: return (set1->sig[1] == set2->sig[1]) && (set1->sig[0] == set2->sig[0]); case 1: return set1->sig[0] == set2->sig[0]; } return 0; } #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS #include <linux/string.h> #define _SIG_SET_BINOP(name, op) \ static inline void name(sigset_t *r, const sigset_t *a, const sigset_t *b) \ { \ unsigned long a0, a1, a2, a3, b0, b1, b2, b3; \ \ switch (_NSIG_WORDS) { \ case 4: \ a3 = a->sig[3]; a2 = a->sig[2]; \ b3 = b->sig[3]; b2 = b->sig[2]; \ r->sig[3] = op(a3, b3); \ r->sig[2] = op(a2, b2); \ case 2: \ a1 = a->sig[1]; b1 = b->sig[1]; \ r->sig[1] = op(a1, b1); \ case 1: \ a0 = a->sig[0]; b0 = b->sig[0]; \ r->sig[0] = op(a0, b0); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_or(x,y) ((x) | (y)) _SIG_SET_BINOP(sigorsets, _sig_or) #define _sig_and(x,y) ((x) & (y)) _SIG_SET_BINOP(sigandsets, _sig_and) #define _sig_andn(x,y) ((x) & ~(y)) _SIG_SET_BINOP(sigandnsets, _sig_andn) #undef _SIG_SET_BINOP #undef _sig_or #undef _sig_and #undef _sig_andn #define _SIG_SET_OP(name, op) \ static inline void name(sigset_t *set) \ { \ switch (_NSIG_WORDS) { \ case 4: set->sig[3] = op(set->sig[3]); \ set->sig[2] = op(set->sig[2]); \ case 2: set->sig[1] = op(set->sig[1]); \ case 1: set->sig[0] = op(set->sig[0]); \ break; \ default: \ BUILD_BUG(); \ } \ } #define _sig_not(x) (~(x)) _SIG_SET_OP(signotset, _sig_not) #undef _SIG_SET_OP #undef _sig_not static inline void sigemptyset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, 0, sizeof(sigset_t)); break; case 2: set->sig[1] = 0; case 1: set->sig[0] = 0; break; } } static inline void sigfillset(sigset_t *set) { switch (_NSIG_WORDS) { default: memset(set, -1, sizeof(sigset_t)); break; case 2: set->sig[1] = -1; case 1: set->sig[0] = -1; break; } } /* Some extensions for manipulating the low 32 signals in particular. */ static inline void sigaddsetmask(sigset_t *set, unsigned long mask) { set->sig[0] |= mask; } static inline void sigdelsetmask(sigset_t *set, unsigned long mask) { set->sig[0] &= ~mask; } static inline int sigtestsetmask(sigset_t *set, unsigned long mask) { return (set->sig[0] & mask) != 0; } static inline void siginitset(sigset_t *set, unsigned long mask) { set->sig[0] = mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], 0, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = 0; case 1: ; } } static inline void siginitsetinv(sigset_t *set, unsigned long mask) { set->sig[0] = ~mask; switch (_NSIG_WORDS) { default: memset(&set->sig[1], -1, sizeof(long)*(_NSIG_WORDS-1)); break; case 2: set->sig[1] = -1; case 1: ; } } #endif /* __HAVE_ARCH_SIG_SETOPS */ static inline void init_sigpending(struct sigpending *sig) { sigemptyset(&sig->signal); INIT_LIST_HEAD(&sig->list); } extern void flush_sigqueue(struct sigpending *queue); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) { return sig <= _NSIG ? 1 : 0; } struct timespec; struct pt_regs; extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p, bool group); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); extern int sigprocmask(int, sigset_t *, sigset_t *); extern void set_current_blocked(sigset_t *); extern void __set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern int get_signal(struct ksignal *ksig); extern void signal_setup_done(int failed, struct ksignal *ksig, int stepping); extern void exit_signals(struct task_struct *tsk); extern void kernel_sigaction(int, __sighandler_t); #define SIG_KTHREAD ((__force __sighandler_t)2) #define SIG_KTHREAD_KERNEL ((__force __sighandler_t)3) static inline void allow_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know it'll be handled, so that they don't get converted to * SIGKILL or just silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD); } static inline void allow_kernel_signal(int sig) { /* * Kernel threads handle their own signals. Let the signal code * know signals sent by the kernel will be handled, so that they * don't get silently dropped. */ kernel_sigaction(sig, SIG_KTHREAD_KERNEL); } static inline void disallow_signal(int sig) { kernel_sigaction(sig, SIG_IGN); } extern struct kmem_cache *sighand_cachep; int unhandled_signal(struct task_struct *tsk, int sig); /* * In POSIX a signal is sent either to a specific thread (Linux task) * or to the process as a whole (Linux thread group). How the signal * is sent determines whether it's to one thread or the whole group, * which determines which signal mask(s) are involved in blocking it * from being delivered until later. When the signal is delivered, * either it's caught or ignored by a user handler or it has a default * effect that applies to the whole thread group (POSIX process). * * The possible effects an unblocked signal set to SIG_DFL can have are: * ignore - Nothing Happens * terminate - kill the process, i.e. all threads in the group, * similar to exit_group. The group leader (only) reports * WIFSIGNALED status to its parent. * coredump - write a core dump file describing all threads using * the same mm and then kill all those threads * stop - stop all the threads in the group, i.e. TASK_STOPPED state * * SIGKILL and SIGSTOP cannot be caught, blocked, or ignored. * Other signals when not blocked and set to SIG_DFL behaves as follows. * The job control signals also have other special effects. * * +--------------------+------------------+ * | POSIX signal | default action | * +--------------------+------------------+ * | SIGHUP | terminate | * | SIGINT | terminate | * | SIGQUIT | coredump | * | SIGILL | coredump | * | SIGTRAP | coredump | * | SIGABRT/SIGIOT | coredump | * | SIGBUS | coredump | * | SIGFPE | coredump | * | SIGKILL | terminate(+) | * | SIGUSR1 | terminate | * | SIGSEGV | coredump | * | SIGUSR2 | terminate | * | SIGPIPE | terminate | * | SIGALRM | terminate | * | SIGTERM | terminate | * | SIGCHLD | ignore | * | SIGCONT | ignore(*) | * | SIGSTOP | stop(*)(+) | * | SIGTSTP | stop(*) | * | SIGTTIN | stop(*) | * | SIGTTOU | stop(*) | * | SIGURG | ignore | * | SIGXCPU | coredump | * | SIGXFSZ | coredump | * | SIGVTALRM | terminate | * | SIGPROF | terminate | * | SIGPOLL/SIGIO | terminate | * | SIGSYS/SIGUNUSED | coredump | * | SIGSTKFLT | terminate | * | SIGWINCH | ignore | * | SIGPWR | terminate | * | SIGRTMIN-SIGRTMAX | terminate | * +--------------------+------------------+ * | non-POSIX signal | default action | * +--------------------+------------------+ * | SIGEMT | coredump | * +--------------------+------------------+ * * (+) For SIGKILL and SIGSTOP the action is "always", not just "default". * (*) Special job control effects: * When SIGCONT is sent, it resumes the process (all threads in the group) * from TASK_STOPPED state and also clears any pending/queued stop signals * (any of those marked with "stop(*)"). This happens regardless of blocking, * catching, or ignoring SIGCONT. When any stop signal is sent, it clears * any pending/queued SIGCONT signals; this happens regardless of blocking, * catching, or ignored the stop signal, though (except for SIGSTOP) the * default action of stopping the process may happen later or never. */ #ifdef SIGEMT #define SIGEMT_MASK rt_sigmask(SIGEMT) #else #define SIGEMT_MASK 0 #endif #if SIGRTMIN > BITS_PER_LONG #define rt_sigmask(sig) (1ULL << ((sig)-1)) #else #define rt_sigmask(sig) sigmask(sig) #endif #define siginmask(sig, mask) \ ((sig) < SIGRTMIN && (rt_sigmask(sig) & (mask))) #define SIG_KERNEL_ONLY_MASK (\ rt_sigmask(SIGKILL) | rt_sigmask(SIGSTOP)) #define SIG_KERNEL_STOP_MASK (\ rt_sigmask(SIGSTOP) | rt_sigmask(SIGTSTP) | \ rt_sigmask(SIGTTIN) | rt_sigmask(SIGTTOU) ) #define SIG_KERNEL_COREDUMP_MASK (\ rt_sigmask(SIGQUIT) | rt_sigmask(SIGILL) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGABRT) | \ rt_sigmask(SIGFPE) | rt_sigmask(SIGSEGV) | \ rt_sigmask(SIGBUS) | rt_sigmask(SIGSYS) | \ rt_sigmask(SIGXCPU) | rt_sigmask(SIGXFSZ) | \ SIGEMT_MASK ) #define SIG_KERNEL_IGNORE_MASK (\ rt_sigmask(SIGCONT) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGWINCH) | rt_sigmask(SIGURG) ) #define SIG_SPECIFIC_SICODES_MASK (\ rt_sigmask(SIGILL) | rt_sigmask(SIGFPE) | \ rt_sigmask(SIGSEGV) | rt_sigmask(SIGBUS) | \ rt_sigmask(SIGTRAP) | rt_sigmask(SIGCHLD) | \ rt_sigmask(SIGPOLL) | rt_sigmask(SIGSYS) | \ SIGEMT_MASK ) #define sig_kernel_only(sig) siginmask(sig, SIG_KERNEL_ONLY_MASK) #define sig_kernel_coredump(sig) siginmask(sig, SIG_KERNEL_COREDUMP_MASK) #define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) #define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) #define sig_specific_sicodes(sig) siginmask(sig, SIG_SPECIFIC_SICODES_MASK) #define sig_fatal(t, signr) \ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) void signals_init(void); int restore_altstack(const stack_t __user *); int __save_altstack(stack_t __user *, unsigned long); #define save_altstack_ex(uss, sp) do { \ stack_t __user *__uss = uss; \ struct task_struct *t = current; \ put_user_ex((void __user *)t->sas_ss_sp, &__uss->ss_sp); \ put_user_ex(t->sas_ss_flags, &__uss->ss_flags); \ put_user_ex(t->sas_ss_size, &__uss->ss_size); \ if (t->sas_ss_flags & SS_AUTODISARM) \ sas_ss_reset(t); \ } while (0); #ifdef CONFIG_PROC_FS struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); #endif #endif /* _LINUX_SIGNAL_H */
1 35 35 1 1 1 2 1379 1380 1379 1381 1381 1381 1381 1381 1380 1381 1381 1337 319 1381 1381 1380 1381 1381 1287 1381 1381 1381 1 1381 35 35 35 35 35 35 1 43 43 43 43 43 42 2 1 15 15 1 1 1 1 12 12 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 /* * sd.c Copyright (C) 1992 Drew Eckhardt * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale * * Linux scsi disk driver * Initial versions: Drew Eckhardt * Subsequent revisions: Eric Youngdale * Modification history: * - Drew Eckhardt <drew@colorado.edu> original * - Eric Youngdale <eric@andante.org> add scatter-gather, multiple * outstanding request, and other enhancements. * Support loadable low-level scsi drivers. * - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using * eight major numbers. * - Richard Gooch <rgooch@atnf.csiro.au> support devfs. * - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in * sd_init and cleanups. * - Alex Davis <letmein@erols.com> Fix problem where partition info * not being read in sd_open. Fix problem where removable media * could be ejected after sd_open. * - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x * - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox * <willy@debian.org>, Kurt Garloff <garloff@suse.de>: * Support 32k/1M disks. * * Logging policy (needs CONFIG_SCSI_LOGGING defined): * - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2 * - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1 * - entering sd_ioctl: SCSI_LOG_IOCTL level 1 * - entering other commands: SCSI_LOG_HLQUEUE level 3 * Note: when the logging level is set by the user, it must be greater * than the level indicated above to trigger output. */ #include <linux/module.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/bio.h> #include <linux/genhd.h> #include <linux/hdreg.h> #include <linux/errno.h> #include <linux/idr.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/delay.h> #include <linux/mutex.h> #include <linux/string_helpers.h> #include <linux/async.h> #include <linux/slab.h> #include <linux/sed-opal.h> #include <linux/pm_runtime.h> #include <linux/pr.h> #include <linux/t10-pi.h> #include <linux/uaccess.h> #include <asm/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsicam.h> #include "sd.h" #include "scsi_priv.h" #include "scsi_logging.h" MODULE_AUTHOR("Eric Youngdale"); MODULE_DESCRIPTION("SCSI disk (sd) driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR); MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR); MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK); MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD); MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC); MODULE_ALIAS_SCSI_DEVICE(TYPE_ZBC); #if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT) #define SD_MINORS 16 #else #define SD_MINORS 0 #endif static void sd_config_discard(struct scsi_disk *, unsigned int); static void sd_config_write_same(struct scsi_disk *); static int sd_revalidate_disk(struct gendisk *); static void sd_unlock_native_capacity(struct gendisk *disk); static int sd_probe(struct device *); static int sd_remove(struct device *); static void sd_shutdown(struct device *); static int sd_suspend_system(struct device *); static int sd_suspend_runtime(struct device *); static int sd_resume(struct device *); static void sd_rescan(struct device *); static int sd_init_command(struct scsi_cmnd *SCpnt); static void sd_uninit_command(struct scsi_cmnd *SCpnt); static int sd_done(struct scsi_cmnd *); static void sd_eh_reset(struct scsi_cmnd *); static int sd_eh_action(struct scsi_cmnd *, int); static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); static void scsi_disk_release(struct device *cdev); static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *); static void sd_print_result(const struct scsi_disk *, const char *, int); static DEFINE_SPINLOCK(sd_index_lock); static DEFINE_IDA(sd_index_ida); /* This semaphore is used to mediate the 0->1 reference get in the * face of object destruction (i.e. we can't allow a get on an * object after last put) */ static DEFINE_MUTEX(sd_ref_mutex); static struct kmem_cache *sd_cdb_cache; static mempool_t *sd_cdb_pool; static mempool_t *sd_page_pool; static const char *sd_cache_types[] = { "write through", "none", "write back", "write back, no read (daft)" }; static void sd_set_flush_flag(struct scsi_disk *sdkp) { bool wc = false, fua = false; if (sdkp->WCE) { wc = true; if (sdkp->DPOFUA) fua = true; } blk_queue_write_cache(sdkp->disk->queue, wc, fua); } static ssize_t cache_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int ct, rcd, wce, sp; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; char buffer[64]; char *buffer_data; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; static const char temp[] = "temporary "; int len; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) /* no cache control on RBC devices; theoretically they * can do it, but there's probably so many exceptions * it's not worth the risk */ return -EINVAL; if (strncmp(buf, temp, sizeof(temp) - 1) == 0) { buf += sizeof(temp) - 1; sdkp->cache_override = 1; } else { sdkp->cache_override = 0; } ct = sysfs_match_string(sd_cache_types, buf); if (ct < 0) return -EINVAL; rcd = ct & 0x01 ? 1 : 0; wce = (ct & 0x02) && !sdkp->write_prot ? 1 : 0; if (sdkp->cache_override) { sdkp->WCE = wce; sdkp->RCD = rcd; sd_set_flush_flag(sdkp); return count; } if (scsi_mode_sense(sdp, 0x08, 8, buffer, sizeof(buffer), SD_TIMEOUT, SD_MAX_RETRIES, &data, NULL)) return -EINVAL; len = min_t(size_t, sizeof(buffer), data.length - data.header_length - data.block_descriptor_length); buffer_data = buffer + data.header_length + data.block_descriptor_length; buffer_data[2] &= ~0x05; buffer_data[2] |= wce << 2 | rcd; sp = buffer_data[0] & 0x80 ? 1 : 0; buffer_data[0] &= ~0x80; /* * Ensure WP, DPOFUA, and RESERVED fields are cleared in * received mode parameter buffer before doing MODE SELECT. */ data.device_specific = 0; if (scsi_mode_select(sdp, 1, sp, 8, buffer_data, len, SD_TIMEOUT, SD_MAX_RETRIES, &data, &sshdr)) { if (scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return -EINVAL; } revalidate_disk(sdkp->disk); return count; } static ssize_t manage_start_stop_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; return sprintf(buf, "%u\n", sdp->manage_start_stop); } static ssize_t manage_start_stop_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; bool v; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (kstrtobool(buf, &v)) return -EINVAL; sdp->manage_start_stop = v; return count; } static DEVICE_ATTR_RW(manage_start_stop); static ssize_t allow_restart_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->device->allow_restart); } static ssize_t allow_restart_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { bool v; struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; if (kstrtobool(buf, &v)) return -EINVAL; sdp->allow_restart = v; return count; } static DEVICE_ATTR_RW(allow_restart); static ssize_t cache_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); int ct = sdkp->RCD + 2*sdkp->WCE; return sprintf(buf, "%s\n", sd_cache_types[ct]); } static DEVICE_ATTR_RW(cache_type); static ssize_t FUA_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->DPOFUA); } static DEVICE_ATTR_RO(FUA); static ssize_t protection_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->protection_type); } static ssize_t protection_type_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); unsigned int val; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &val); if (err) return err; if (val <= T10_PI_TYPE3_PROTECTION) sdkp->protection_type = val; return count; } static DEVICE_ATTR_RW(protection_type); static ssize_t protection_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; unsigned int dif, dix; dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); dix = scsi_host_dix_capable(sdp->host, sdkp->protection_type); if (!dix && scsi_host_dix_capable(sdp->host, T10_PI_TYPE0_PROTECTION)) { dif = 0; dix = 1; } if (!dif && !dix) return sprintf(buf, "none\n"); return sprintf(buf, "%s%u\n", dix ? "dix" : "dif", dif); } static DEVICE_ATTR_RO(protection_mode); static ssize_t app_tag_own_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->ATO); } static DEVICE_ATTR_RO(app_tag_own); static ssize_t thin_provisioning_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->lbpme); } static DEVICE_ATTR_RO(thin_provisioning); /* sysfs_match_string() requires dense arrays */ static const char *lbp_mode[] = { [SD_LBP_FULL] = "full", [SD_LBP_UNMAP] = "unmap", [SD_LBP_WS16] = "writesame_16", [SD_LBP_WS10] = "writesame_10", [SD_LBP_ZERO] = "writesame_zero", [SD_LBP_DISABLE] = "disabled", }; static ssize_t provisioning_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", lbp_mode[sdkp->provisioning_mode]); } static ssize_t provisioning_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; int mode; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sd_is_zoned(sdkp)) { sd_config_discard(sdkp, SD_LBP_DISABLE); return count; } if (sdp->type != TYPE_DISK) return -EINVAL; mode = sysfs_match_string(lbp_mode, buf); if (mode < 0) return -EINVAL; sd_config_discard(sdkp, mode); return count; } static DEVICE_ATTR_RW(provisioning_mode); /* sysfs_match_string() requires dense arrays */ static const char *zeroing_mode[] = { [SD_ZERO_WRITE] = "write", [SD_ZERO_WS] = "writesame", [SD_ZERO_WS16_UNMAP] = "writesame_16_unmap", [SD_ZERO_WS10_UNMAP] = "writesame_10_unmap", }; static ssize_t zeroing_mode_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%s\n", zeroing_mode[sdkp->zeroing_mode]); } static ssize_t zeroing_mode_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int mode; if (!capable(CAP_SYS_ADMIN)) return -EACCES; mode = sysfs_match_string(zeroing_mode, buf); if (mode < 0) return -EINVAL; sdkp->zeroing_mode = mode; return count; } static DEVICE_ATTR_RW(zeroing_mode); static ssize_t max_medium_access_timeouts_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_medium_access_timeouts); } static ssize_t max_medium_access_timeouts_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; err = kstrtouint(buf, 10, &sdkp->max_medium_access_timeouts); return err ? err : count; } static DEVICE_ATTR_RW(max_medium_access_timeouts); static ssize_t max_write_same_blocks_show(struct device *dev, struct device_attribute *attr, char *buf) { struct scsi_disk *sdkp = to_scsi_disk(dev); return sprintf(buf, "%u\n", sdkp->max_ws_blocks); } static ssize_t max_write_same_blocks_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct scsi_device *sdp = sdkp->device; unsigned long max; int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return -EINVAL; err = kstrtoul(buf, 10, &max); if (err) return err; if (max == 0) sdp->no_write_same = 1; else if (max <= SD_MAX_WS16_BLOCKS) { sdp->no_write_same = 0; sdkp->max_ws_blocks = max; } sd_config_write_same(sdkp); return count; } static DEVICE_ATTR_RW(max_write_same_blocks); static struct attribute *sd_disk_attrs[] = { &dev_attr_cache_type.attr, &dev_attr_FUA.attr, &dev_attr_allow_restart.attr, &dev_attr_manage_start_stop.attr, &dev_attr_protection_type.attr, &dev_attr_protection_mode.attr, &dev_attr_app_tag_own.attr, &dev_attr_thin_provisioning.attr, &dev_attr_provisioning_mode.attr, &dev_attr_zeroing_mode.attr, &dev_attr_max_write_same_blocks.attr, &dev_attr_max_medium_access_timeouts.attr, NULL, }; ATTRIBUTE_GROUPS(sd_disk); static struct class sd_disk_class = { .name = "scsi_disk", .owner = THIS_MODULE, .dev_release = scsi_disk_release, .dev_groups = sd_disk_groups, }; static const struct dev_pm_ops sd_pm_ops = { .suspend = sd_suspend_system, .resume = sd_resume, .poweroff = sd_suspend_system, .restore = sd_resume, .runtime_suspend = sd_suspend_runtime, .runtime_resume = sd_resume, }; static struct scsi_driver sd_template = { .gendrv = { .name = "sd", .owner = THIS_MODULE, .probe = sd_probe, .remove = sd_remove, .shutdown = sd_shutdown, .pm = &sd_pm_ops, }, .rescan = sd_rescan, .init_command = sd_init_command, .uninit_command = sd_uninit_command, .done = sd_done, .eh_action = sd_eh_action, .eh_reset = sd_eh_reset, }; /* * Dummy kobj_map->probe function. * The default ->probe function will call modprobe, which is * pointless as this module is already loaded. */ static struct kobject *sd_default_probe(dev_t devt, int *partno, void *data) { return NULL; } /* * Device no to disk mapping: * * major disc2 disc p1 * |............|.............|....|....| <- dev_t * 31 20 19 8 7 4 3 0 * * Inside a major, we have 16k disks, however mapped non- * contiguously. The first 16 disks are for major0, the next * ones with major1, ... Disk 256 is for major0 again, disk 272 * for major1, ... * As we stay compatible with our numbering scheme, we can reuse * the well-know SCSI majors 8, 65--71, 136--143. */ static int sd_major(int major_idx) { switch (major_idx) { case 0: return SCSI_DISK0_MAJOR; case 1 ... 7: return SCSI_DISK1_MAJOR + major_idx - 1; case 8 ... 15: return SCSI_DISK8_MAJOR + major_idx - 8; default: BUG(); return 0; /* shut up gcc */ } } static struct scsi_disk *scsi_disk_get(struct gendisk *disk) { struct scsi_disk *sdkp = NULL; mutex_lock(&sd_ref_mutex); if (disk->private_data) { sdkp = scsi_disk(disk); if (scsi_device_get(sdkp->device) == 0) get_device(&sdkp->dev); else sdkp = NULL; } mutex_unlock(&sd_ref_mutex); return sdkp; } static void scsi_disk_put(struct scsi_disk *sdkp) { struct scsi_device *sdev = sdkp->device; mutex_lock(&sd_ref_mutex); put_device(&sdkp->dev); scsi_device_put(sdev); mutex_unlock(&sd_ref_mutex); } #ifdef CONFIG_BLK_SED_OPAL static int sd_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send) { struct scsi_device *sdev = data; u8 cdb[12] = { 0, }; int ret; cdb[0] = send ? SECURITY_PROTOCOL_OUT : SECURITY_PROTOCOL_IN; cdb[1] = secp; put_unaligned_be16(spsp, &cdb[2]); put_unaligned_be32(len, &cdb[6]); ret = scsi_execute_req(sdev, cdb, send ? DMA_TO_DEVICE : DMA_FROM_DEVICE, buffer, len, NULL, SD_TIMEOUT, SD_MAX_RETRIES, NULL); return ret <= 0 ? ret : -EIO; } #endif /* CONFIG_BLK_SED_OPAL */ static unsigned char sd_setup_protect_cmnd(struct scsi_cmnd *scmd, unsigned int dix, unsigned int dif) { struct bio *bio = scmd->request->bio; unsigned int prot_op = sd_prot_op(rq_data_dir(scmd->request), dix, dif); unsigned int protect = 0; if (dix) { /* DIX Type 0, 1, 2, 3 */ if (bio_integrity_flagged(bio, BIP_IP_CHECKSUM)) scmd->prot_flags |= SCSI_PROT_IP_CHECKSUM; if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) scmd->prot_flags |= SCSI_PROT_GUARD_CHECK; } if (dif != T10_PI_TYPE3_PROTECTION) { /* DIX/DIF Type 0, 1, 2 */ scmd->prot_flags |= SCSI_PROT_REF_INCREMENT; if (bio_integrity_flagged(bio, BIP_CTRL_NOCHECK) == false) scmd->prot_flags |= SCSI_PROT_REF_CHECK; } if (dif) { /* DIX/DIF Type 1, 2, 3 */ scmd->prot_flags |= SCSI_PROT_TRANSFER_PI; if (bio_integrity_flagged(bio, BIP_DISK_NOCHECK)) protect = 3 << 5; /* Disable target PI checking */ else protect = 1 << 5; /* Enable target PI checking */ } scsi_set_prot_op(scmd, prot_op); scsi_set_prot_type(scmd, dif); scmd->prot_flags &= sd_prot_flag_mask(prot_op); return protect; } static void sd_config_discard(struct scsi_disk *sdkp, unsigned int mode) { struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; unsigned int max_blocks = 0; q->limits.discard_alignment = sdkp->unmap_alignment * logical_block_size; q->limits.discard_granularity = max(sdkp->physical_block_size, sdkp->unmap_granularity * logical_block_size); sdkp->provisioning_mode = mode; switch (mode) { case SD_LBP_FULL: case SD_LBP_DISABLE: blk_queue_max_discard_sectors(q, 0); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); return; case SD_LBP_UNMAP: max_blocks = min_not_zero(sdkp->max_unmap_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS16: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS16_BLOCKS); break; case SD_LBP_WS10: if (sdkp->device->unmap_limit_for_ws) max_blocks = sdkp->max_unmap_blocks; else max_blocks = sdkp->max_ws_blocks; max_blocks = min_not_zero(max_blocks, (u32)SD_MAX_WS10_BLOCKS); break; case SD_LBP_ZERO: max_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); break; } blk_queue_max_discard_sectors(q, max_blocks * (logical_block_size >> 9)); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } static int sd_setup_unmap_cmnd(struct scsi_cmnd *cmd) { struct scsi_device *sdp = cmd->device; struct request *rq = cmd->request; u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9); u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9); unsigned int data_len = 24; char *buf; rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!rq->special_vec.bv_page) return BLKPREP_DEFER; clear_highpage(rq->special_vec.bv_page); rq->special_vec.bv_offset = 0; rq->special_vec.bv_len = data_len; rq->rq_flags |= RQF_SPECIAL_PAYLOAD; cmd->cmd_len = 10; cmd->cmnd[0] = UNMAP; cmd->cmnd[8] = 24; buf = page_address(rq->special_vec.bv_page); put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(sector, &buf[8]); put_unaligned_be32(nr_sectors, &buf[16]); cmd->allowed = SD_MAX_RETRIES; cmd->transfersize = data_len; rq->timeout = SD_TIMEOUT; scsi_req(rq)->resid_len = data_len; return scsi_init_io(cmd); } static int sd_setup_write_same16_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = cmd->request; u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9); u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9); u32 data_len = sdp->sector_size; rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!rq->special_vec.bv_page) return BLKPREP_DEFER; clear_highpage(rq->special_vec.bv_page); rq->special_vec.bv_offset = 0; rq->special_vec.bv_len = data_len; rq->rq_flags |= RQF_SPECIAL_PAYLOAD; cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be64(sector, &cmd->cmnd[2]); put_unaligned_be32(nr_sectors, &cmd->cmnd[10]); cmd->allowed = SD_MAX_RETRIES; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; scsi_req(rq)->resid_len = data_len; return scsi_init_io(cmd); } static int sd_setup_write_same10_cmnd(struct scsi_cmnd *cmd, bool unmap) { struct scsi_device *sdp = cmd->device; struct request *rq = cmd->request; u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9); u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9); u32 data_len = sdp->sector_size; rq->special_vec.bv_page = mempool_alloc(sd_page_pool, GFP_ATOMIC); if (!rq->special_vec.bv_page) return BLKPREP_DEFER; clear_highpage(rq->special_vec.bv_page); rq->special_vec.bv_offset = 0; rq->special_vec.bv_len = data_len; rq->rq_flags |= RQF_SPECIAL_PAYLOAD; cmd->cmd_len = 10; cmd->cmnd[0] = WRITE_SAME; if (unmap) cmd->cmnd[1] = 0x8; /* UNMAP */ put_unaligned_be32(sector, &cmd->cmnd[2]); put_unaligned_be16(nr_sectors, &cmd->cmnd[7]); cmd->allowed = SD_MAX_RETRIES; cmd->transfersize = data_len; rq->timeout = unmap ? SD_TIMEOUT : SD_WRITE_SAME_TIMEOUT; scsi_req(rq)->resid_len = data_len; return scsi_init_io(cmd); } static int sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); u64 sector = blk_rq_pos(rq) >> (ilog2(sdp->sector_size) - 9); u32 nr_sectors = blk_rq_sectors(rq) >> (ilog2(sdp->sector_size) - 9); int ret; if (!(rq->cmd_flags & REQ_NOUNMAP)) { switch (sdkp->zeroing_mode) { case SD_ZERO_WS16_UNMAP: ret = sd_setup_write_same16_cmnd(cmd, true); goto out; case SD_ZERO_WS10_UNMAP: ret = sd_setup_write_same10_cmnd(cmd, true); goto out; } } if (sdp->no_write_same) return BLKPREP_INVALID; if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff) ret = sd_setup_write_same16_cmnd(cmd, false); else ret = sd_setup_write_same10_cmnd(cmd, false); out: if (sd_is_zoned(sdkp) && ret == BLKPREP_OK) return sd_zbc_write_lock_zone(cmd); return ret; } static void sd_config_write_same(struct scsi_disk *sdkp) { struct request_queue *q = sdkp->disk->queue; unsigned int logical_block_size = sdkp->device->sector_size; if (sdkp->device->no_write_same) { sdkp->max_ws_blocks = 0; goto out; } /* Some devices can not handle block counts above 0xffff despite * supporting WRITE SAME(16). Consequently we default to 64k * blocks per I/O unless the device explicitly advertises a * bigger limit. */ if (sdkp->max_ws_blocks > SD_MAX_WS10_BLOCKS) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS16_BLOCKS); else if (sdkp->ws16 || sdkp->ws10 || sdkp->device->no_report_opcodes) sdkp->max_ws_blocks = min_not_zero(sdkp->max_ws_blocks, (u32)SD_MAX_WS10_BLOCKS); else { sdkp->device->no_write_same = 1; sdkp->max_ws_blocks = 0; } if (sdkp->lbprz && sdkp->lbpws) sdkp->zeroing_mode = SD_ZERO_WS16_UNMAP; else if (sdkp->lbprz && sdkp->lbpws10) sdkp->zeroing_mode = SD_ZERO_WS10_UNMAP; else if (sdkp->max_ws_blocks) sdkp->zeroing_mode = SD_ZERO_WS; else sdkp->zeroing_mode = SD_ZERO_WRITE; out: blk_queue_max_write_same_sectors(q, sdkp->max_ws_blocks * (logical_block_size >> 9)); blk_queue_max_write_zeroes_sectors(q, sdkp->max_ws_blocks * (logical_block_size >> 9)); } /** * sd_setup_write_same_cmnd - write the same data to multiple blocks * @cmd: command to prepare * * Will set up either WRITE SAME(10) or WRITE SAME(16) depending on * the preference indicated by the target device. **/ static int sd_setup_write_same_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; struct scsi_device *sdp = cmd->device; struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); struct bio *bio = rq->bio; sector_t sector = blk_rq_pos(rq); unsigned int nr_sectors = blk_rq_sectors(rq); unsigned int nr_bytes = blk_rq_bytes(rq); int ret; if (sdkp->device->no_write_same) return BLKPREP_INVALID; BUG_ON(bio_offset(bio) || bio_iovec(bio).bv_len != sdp->sector_size); if (sd_is_zoned(sdkp)) { ret = sd_zbc_write_lock_zone(cmd); if (ret != BLKPREP_OK) return ret; } sector >>= ilog2(sdp->sector_size) - 9; nr_sectors >>= ilog2(sdp->sector_size) - 9; rq->timeout = SD_WRITE_SAME_TIMEOUT; if (sdkp->ws16 || sector > 0xffffffff || nr_sectors > 0xffff) { cmd->cmd_len = 16; cmd->cmnd[0] = WRITE_SAME_16; put_unaligned_be64(sector, &cmd->cmnd[2]); put_unaligned_be32(nr_sectors, &cmd->cmnd[10]); } else { cmd->cmd_len = 10; cmd->cmnd[0] = WRITE_SAME; put_unaligned_be32(sector, &cmd->cmnd[2]); put_unaligned_be16(nr_sectors, &cmd->cmnd[7]); } cmd->transfersize = sdp->sector_size; cmd->allowed = SD_MAX_RETRIES; /* * For WRITE SAME the data transferred via the DATA OUT buffer is * different from the amount of data actually written to the target. * * We set up __data_len to the amount of data transferred via the * DATA OUT buffer so that blk_rq_map_sg sets up the proper S/G list * to transfer a single sector of data first, but then reset it to * the amount of data to be written right after so that the I/O path * knows how much to actually write. */ rq->__data_len = sdp->sector_size; ret = scsi_init_io(cmd); rq->__data_len = nr_bytes; if (sd_is_zoned(sdkp) && ret != BLKPREP_OK) sd_zbc_write_unlock_zone(cmd); return ret; } static int sd_setup_flush_cmnd(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; /* flush requests don't perform I/O, zero the S/G table */ memset(&cmd->sdb, 0, sizeof(cmd->sdb)); cmd->cmnd[0] = SYNCHRONIZE_CACHE; cmd->cmd_len = 10; cmd->transfersize = 0; cmd->allowed = SD_MAX_RETRIES; rq->timeout = rq->q->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; return BLKPREP_OK; } static int sd_setup_read_write_cmnd(struct scsi_cmnd *SCpnt) { struct request *rq = SCpnt->request; struct scsi_device *sdp = SCpnt->device; struct gendisk *disk = rq->rq_disk; struct scsi_disk *sdkp = scsi_disk(disk); sector_t block = blk_rq_pos(rq); sector_t threshold; unsigned int this_count = blk_rq_sectors(rq); unsigned int dif, dix; bool zoned_write = sd_is_zoned(sdkp) && rq_data_dir(rq) == WRITE; int ret; unsigned char protect; if (zoned_write) { ret = sd_zbc_write_lock_zone(SCpnt); if (ret != BLKPREP_OK) return ret; } ret = scsi_init_io(SCpnt); if (ret != BLKPREP_OK) goto out; WARN_ON_ONCE(SCpnt != rq->special); /* from here on until we're complete, any goto out * is used for a killable error condition */ ret = BLKPREP_KILL; SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, "%s: block=%llu, count=%d\n", __func__, (unsigned long long)block, this_count)); if (!sdp || !scsi_device_online(sdp) || block + blk_rq_sectors(rq) > get_capacity(disk)) { SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "Finishing %u sectors\n", blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "Retry with 0x%p\n", SCpnt)); goto out; } if (sdp->changed) { /* * quietly refuse to do anything to a changed disc until * the changed bit has been reset */ /* printk("SCSI disk has been changed or is not present. Prohibiting further I/O.\n"); */ goto out; } /* * Some SD card readers can't handle multi-sector accesses which touch * the last one or two hardware sectors. Split accesses as needed. */ threshold = get_capacity(disk) - SD_LAST_BUGGY_SECTORS * (sdp->sector_size / 512); if (unlikely(sdp->last_sector_bug && block + this_count > threshold)) { if (block < threshold) { /* Access up to the threshold but not beyond */ this_count = threshold - block; } else { /* Access only a single hardware sector */ this_count = sdp->sector_size / 512; } } SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "block=%llu\n", (unsigned long long)block)); /* * If we have a 1K hardware sectorsize, prevent access to single * 512 byte sectors. In theory we could handle this - in fact * the scsi cdrom driver must be able to handle this because * we typically use 1K blocksizes, and cdroms typically have * 2K hardware sectorsizes. Of course, things are simpler * with the cdrom, since it is read-only. For performance * reasons, the filesystems should be able to handle this * and not force the scsi disk driver to use bounce buffers * for this. */ if (sdp->sector_size == 1024) { if ((block & 1) || (blk_rq_sectors(rq) & 1)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); goto out; } else { block = block >> 1; this_count = this_count >> 1; } } if (sdp->sector_size == 2048) { if ((block & 3) || (blk_rq_sectors(rq) & 3)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); goto out; } else { block = block >> 2; this_count = this_count >> 2; } } if (sdp->sector_size == 4096) { if ((block & 7) || (blk_rq_sectors(rq) & 7)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); goto out; } else { block = block >> 3; this_count = this_count >> 3; } } if (rq_data_dir(rq) == WRITE) { SCpnt->cmnd[0] = WRITE_6; if (blk_integrity_rq(rq)) sd_dif_prepare(SCpnt); } else if (rq_data_dir(rq) == READ) { SCpnt->cmnd[0] = READ_6; } else { scmd_printk(KERN_ERR, SCpnt, "Unknown command %d\n", req_op(rq)); goto out; } SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "%s %d/%u 512 byte blocks.\n", (rq_data_dir(rq) == WRITE) ? "writing" : "reading", this_count, blk_rq_sectors(rq))); dix = scsi_prot_sg_count(SCpnt); dif = scsi_host_dif_capable(SCpnt->device->host, sdkp->protection_type); if (dif || dix) protect = sd_setup_protect_cmnd(SCpnt, dix, dif); else protect = 0; if (protect && sdkp->protection_type == T10_PI_TYPE2_PROTECTION) { SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC); if (unlikely(SCpnt->cmnd == NULL)) { ret = BLKPREP_DEFER; goto out; } SCpnt->cmd_len = SD_EXT_CDB_SIZE; memset(SCpnt->cmnd, 0, SCpnt->cmd_len); SCpnt->cmnd[0] = VARIABLE_LENGTH_CMD; SCpnt->cmnd[7] = 0x18; SCpnt->cmnd[9] = (rq_data_dir(rq) == READ) ? READ_32 : WRITE_32; SCpnt->cmnd[10] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0); /* LBA */ SCpnt->cmnd[12] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0; SCpnt->cmnd[13] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0; SCpnt->cmnd[14] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0; SCpnt->cmnd[15] = sizeof(block) > 4 ? (unsigned char) (block >> 32) & 0xff : 0; SCpnt->cmnd[16] = (unsigned char) (block >> 24) & 0xff; SCpnt->cmnd[17] = (unsigned char) (block >> 16) & 0xff; SCpnt->cmnd[18] = (unsigned char) (block >> 8) & 0xff; SCpnt->cmnd[19] = (unsigned char) block & 0xff; /* Expected Indirect LBA */ SCpnt->cmnd[20] = (unsigned char) (block >> 24) & 0xff; SCpnt->cmnd[21] = (unsigned char) (block >> 16) & 0xff; SCpnt->cmnd[22] = (unsigned char) (block >> 8) & 0xff; SCpnt->cmnd[23] = (unsigned char) block & 0xff; /* Transfer length */ SCpnt->cmnd[28] = (unsigned char) (this_count >> 24) & 0xff; SCpnt->cmnd[29] = (unsigned char) (this_count >> 16) & 0xff; SCpnt->cmnd[30] = (unsigned char) (this_count >> 8) & 0xff; SCpnt->cmnd[31] = (unsigned char) this_count & 0xff; } else if (sdp->use_16_for_rw || (this_count > 0xffff)) { SCpnt->cmnd[0] += READ_16 - READ_6; SCpnt->cmnd[1] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0); SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0; SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0; SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0; SCpnt->cmnd[5] = sizeof(block) > 4 ? (unsigned char) (block >> 32) & 0xff : 0; SCpnt->cmnd[6] = (unsigned char) (block >> 24) & 0xff; SCpnt->cmnd[7] = (unsigned char) (block >> 16) & 0xff; SCpnt->cmnd[8] = (unsigned char) (block >> 8) & 0xff; SCpnt->cmnd[9] = (unsigned char) block & 0xff; SCpnt->cmnd[10] = (unsigned char) (this_count >> 24) & 0xff; SCpnt->cmnd[11] = (unsigned char) (this_count >> 16) & 0xff; SCpnt->cmnd[12] = (unsigned char) (this_count >> 8) & 0xff; SCpnt->cmnd[13] = (unsigned char) this_count & 0xff; SCpnt->cmnd[14] = SCpnt->cmnd[15] = 0; } else if ((this_count > 0xff) || (block > 0x1fffff) || scsi_device_protection(SCpnt->device) || SCpnt->device->use_10_for_rw) { SCpnt->cmnd[0] += READ_10 - READ_6; SCpnt->cmnd[1] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0); SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff; SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff; SCpnt->cmnd[5] = (unsigned char) block & 0xff; SCpnt->cmnd[6] = SCpnt->cmnd[9] = 0; SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff; SCpnt->cmnd[8] = (unsigned char) this_count & 0xff; } else { if (unlikely(rq->cmd_flags & REQ_FUA)) { /* * This happens only if this drive failed * 10byte rw command with ILLEGAL_REQUEST * during operation and thus turned off * use_10_for_rw. */ scmd_printk(KERN_ERR, SCpnt, "FUA write on READ/WRITE(6) drive\n"); goto out; } SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f); SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff); SCpnt->cmnd[3] = (unsigned char) block & 0xff; SCpnt->cmnd[4] = (unsigned char) this_count; SCpnt->cmnd[5] = 0; } SCpnt->sdb.length = this_count * sdp->sector_size; /* * We shouldn't disconnect in the middle of a sector, so with a dumb * host adapter, it's safe to assume that we can at least transfer * this many bytes between each connect / disconnect. */ SCpnt->transfersize = sdp->sector_size; SCpnt->underflow = this_count << 9; SCpnt->allowed = SD_MAX_RETRIES; /* * This indicates that the command is ready from our end to be * queued. */ ret = BLKPREP_OK; out: if (zoned_write && ret != BLKPREP_OK) sd_zbc_write_unlock_zone(SCpnt); return ret; } static int sd_init_command(struct scsi_cmnd *cmd) { struct request *rq = cmd->request; switch (req_op(rq)) { case REQ_OP_DISCARD: switch (scsi_disk(rq->rq_disk)->provisioning_mode) { case SD_LBP_UNMAP: return sd_setup_unmap_cmnd(cmd); case SD_LBP_WS16: return sd_setup_write_same16_cmnd(cmd, true); case SD_LBP_WS10: return sd_setup_write_same10_cmnd(cmd, true); case SD_LBP_ZERO: return sd_setup_write_same10_cmnd(cmd, false); default: return BLKPREP_INVALID; } case REQ_OP_WRITE_ZEROES: return sd_setup_write_zeroes_cmnd(cmd); case REQ_OP_WRITE_SAME: return sd_setup_write_same_cmnd(cmd); case REQ_OP_FLUSH: return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_REPORT: return sd_zbc_setup_report_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_reset_cmnd(cmd); default: WARN_ON_ONCE(1); return BLKPREP_KILL; } } static void sd_uninit_command(struct scsi_cmnd *SCpnt) { struct request *rq = SCpnt->request; u8 *cmnd; if (SCpnt->flags & SCMD_ZONE_WRITE_LOCK) sd_zbc_write_unlock_zone(SCpnt); if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) mempool_free(rq->special_vec.bv_page, sd_page_pool); if (SCpnt->cmnd != scsi_req(rq)->cmd) { cmnd = SCpnt->cmnd; SCpnt->cmnd = NULL; SCpnt->cmd_len = 0; mempool_free(cmnd, sd_cdb_pool); } } /** * sd_open - open a scsi disk device * @bdev: Block device of the scsi disk to open * @mode: FMODE_* mask * * Returns 0 if successful. Returns a negated errno value in case * of error. * * Note: This can be called from a user context (e.g. fsck(1) ) * or from within the kernel (e.g. as a result of a mount(1) ). * In the latter case @inode and @filp carry an abridged amount * of information as noted above. * * Locking: called with bdev->bd_mutex held. **/ static int sd_open(struct block_device *bdev, fmode_t mode) { struct scsi_disk *sdkp = scsi_disk_get(bdev->bd_disk); struct scsi_device *sdev; int retval; if (!sdkp) return -ENXIO; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n")); sdev = sdkp->device; /* * If the device is in error recovery, wait until it is done. * If the device is offline, then disallow any access to it. */ retval = -ENXIO; if (!scsi_block_when_processing_errors(sdev)) goto error_out; if (sdev->removable || sdkp->write_prot) check_disk_change(bdev); /* * If the drive is empty, just let the open fail. */ retval = -ENOMEDIUM; if (sdev->removable && !sdkp->media_present && !(mode & FMODE_NDELAY)) goto error_out; /* * If the device has the write protect tab set, have the open fail * if the user expects to be able to write to the thing. */ retval = -EROFS; if (sdkp->write_prot && (mode & FMODE_WRITE)) goto error_out; /* * It is possible that the disk changing stuff resulted in * the device being taken offline. If this is the case, * report this to the user, and don't pretend that the * open actually succeeded. */ retval = -ENXIO; if (!scsi_device_online(sdev)) goto error_out; if ((atomic_inc_return(&sdkp->openers) == 1) && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); } return 0; error_out: scsi_disk_put(sdkp); return retval; } /** * sd_release - invoked when the (last) close(2) is called on this * scsi disk. * @disk: disk to release * @mode: FMODE_* mask * * Returns 0. * * Note: may block (uninterruptible) if error recovery is underway * on this disk. * * Locking: called with bdev->bd_mutex held. **/ static void sd_release(struct gendisk *disk, fmode_t mode) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n")); if (atomic_dec_return(&sdkp->openers) == 0 && sdev->removable) { if (scsi_block_when_processing_errors(sdev)) scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); } scsi_disk_put(sdkp); } static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); struct scsi_device *sdp = sdkp->device; struct Scsi_Host *host = sdp->host; sector_t capacity = logical_to_sectors(sdp, sdkp->capacity); int diskinfo[4]; /* default to most commonly used values */ diskinfo[0] = 0x40; /* 1 << 6 */ diskinfo[1] = 0x20; /* 1 << 5 */ diskinfo[2] = capacity >> 11; /* override with calculated, extended default, or driver values */ if (host->hostt->bios_param) host->hostt->bios_param(sdp, bdev, capacity, diskinfo); else scsicam_bios_param(bdev, capacity, diskinfo); geo->heads = diskinfo[0]; geo->sectors = diskinfo[1]; geo->cylinders = diskinfo[2]; return 0; } /** * sd_ioctl - process an ioctl * @bdev: target block device * @mode: FMODE_* mask * @cmd: ioctl command number * @arg: this is third argument given to ioctl(2) system call. * Often contains a pointer. * * Returns 0 if successful (some ioctls return positive numbers on * success as well). Returns a negated errno value in case of error. * * Note: most ioctls are forward onto the block subsystem or further * down in the scsi subsystem. **/ static int sd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; void __user *p = (void __user *)arg; int error; SCSI_LOG_IOCTL(1, sd_printk(KERN_INFO, sdkp, "sd_ioctl: disk=%s, " "cmd=0x%x\n", disk->disk_name, cmd)); error = scsi_verify_blk_ioctl(bdev, cmd); if (error < 0) return error; /* * If we are in the middle of error recovery, don't let anyone * else try and use this device. Also, if error recovery fails, it * may try and take the device offline, in which case all further * access to the device is prohibited. */ error = scsi_ioctl_block_when_processing_errors(sdp, cmd, (mode & FMODE_NDELAY) != 0); if (error) goto out; if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); /* * Send SCSI addressing ioctls directly to mid level, send other * ioctls to block level and then onto mid level if they can't be * resolved. */ switch (cmd) { case SCSI_IOCTL_GET_IDLUN: case SCSI_IOCTL_GET_BUS_NUMBER: error = scsi_ioctl(sdp, cmd, p); break; default: error = scsi_cmd_blk_ioctl(bdev, mode, cmd, p); if (error != -ENOTTY) break; error = scsi_ioctl(sdp, cmd, p); break; } out: return error; } static void set_media_not_present(struct scsi_disk *sdkp) { if (sdkp->media_present) sdkp->device->changed = 1; if (sdkp->device->removable) { sdkp->media_present = 0; sdkp->capacity = 0; } } static int media_not_present(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { if (!scsi_sense_valid(sshdr)) return 0; /* not invoked for commands that could return deferred errors */ switch (sshdr->sense_key) { case UNIT_ATTENTION: case NOT_READY: /* medium not present */ if (sshdr->asc == 0x3A) { set_media_not_present(sdkp); return 1; } } return 0; } /** * sd_check_events - check media events * @disk: kernel device descriptor * @clearing: disk events currently being cleared * * Returns mask of DISK_EVENT_*. * * Note: this function is invoked from the block subsystem. **/ static unsigned int sd_check_events(struct gendisk *disk, unsigned int clearing) { struct scsi_disk *sdkp = scsi_disk_get(disk); struct scsi_device *sdp; int retval; if (!sdkp) return 0; sdp = sdkp->device; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_check_events\n")); /* * If the device is offline, don't send any commands - just pretend as * if the command failed. If the device ever comes back online, we * can deal with it then. It is only because of unrecoverable errors * that we would ever take a device offline in the first place. */ if (!scsi_device_online(sdp)) { set_media_not_present(sdkp); goto out; } /* * Using TEST_UNIT_READY enables differentiation between drive with * no cartridge loaded - NOT READY, drive with changed cartridge - * UNIT ATTENTION, or with same cartridge - GOOD STATUS. * * Drives that auto spin down. eg iomega jaz 1G, will be started * by sd_spinup_disk() from sd_revalidate_disk(), which happens whenever * sd_revalidate() is called. */ if (scsi_block_when_processing_errors(sdp)) { struct scsi_sense_hdr sshdr = { 0, }; retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, SD_MAX_RETRIES, &sshdr); /* failed to execute TUR, assume media not present */ if (host_byte(retval)) { set_media_not_present(sdkp); goto out; } if (media_not_present(sdkp, &sshdr)) goto out; } /* * For removable scsi disk we have to recognise the presence * of a disk in the drive. */ if (!sdkp->media_present) sdp->changed = 1; sdkp->media_present = 1; out: /* * sdp->changed is set under the following conditions: * * Medium present state has changed in either direction. * Device has indicated UNIT_ATTENTION. */ retval = sdp->changed ? DISK_EVENT_MEDIA_CHANGE : 0; sdp->changed = 0; scsi_disk_put(sdkp); return retval; } static int sd_sync_cache(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { int retries, res; struct scsi_device *sdp = sdkp->device; const int timeout = sdp->request_queue->rq_timeout * SD_FLUSH_TIMEOUT_MULTIPLIER; struct scsi_sense_hdr my_sshdr; if (!scsi_device_online(sdp)) return -ENODEV; /* caller might not be interested in sense, but we need it */ if (!sshdr) sshdr = &my_sshdr; for (retries = 3; retries > 0; --retries) { unsigned char cmd[10] = { 0 }; cmd[0] = SYNCHRONIZE_CACHE; /* * Leave the rest of the command zero to indicate * flush everything. */ res = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, sshdr, timeout, SD_MAX_RETRIES, 0, RQF_PM, NULL); if (res == 0) break; } if (res) { sd_print_result(sdkp, "Synchronize Cache(10) failed", res); if (driver_byte(res) & DRIVER_SENSE) sd_print_sense_hdr(sdkp, sshdr); /* we need to evaluate the error return */ if (scsi_sense_valid(sshdr) && (sshdr->asc == 0x3a || /* medium not present */ sshdr->asc == 0x20 || /* invalid command */ (sshdr->asc == 0x74 && sshdr->ascq == 0x71))) /* drive is password locked */ /* this is no error here */ return 0; switch (host_byte(res)) { /* ignore errors due to racing a disconnection */ case DID_BAD_TARGET: case DID_NO_CONNECT: return 0; /* signal the upper layer it might try again */ case DID_BUS_BUSY: case DID_IMM_RETRY: case DID_REQUEUE: case DID_SOFT_ERROR: return -EBUSY; default: return -EIO; } } return 0; } static void sd_rescan(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); revalidate_disk(sdkp->disk); } #ifdef CONFIG_COMPAT /* * This gets directly called from VFS. When the ioctl * is not recognized we go back to the other translation paths. */ static int sd_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdev = sdkp->device; void __user *p = compat_ptr(arg); int error; error = scsi_verify_blk_ioctl(bdev, cmd); if (error < 0) return error; error = scsi_ioctl_block_when_processing_errors(sdev, cmd, (mode & FMODE_NDELAY) != 0); if (error) return error; if (is_sed_ioctl(cmd)) return sed_ioctl(sdkp->opal_dev, cmd, p); /* * Let the static ioctl translation table take care of it. */ if (!sdev->host->hostt->compat_ioctl) return -ENOIOCTLCMD; return sdev->host->hostt->compat_ioctl(sdev, cmd, p); } #endif static char sd_pr_type(enum pr_type type) { switch (type) { case PR_WRITE_EXCLUSIVE: return 0x01; case PR_EXCLUSIVE_ACCESS: return 0x03; case PR_WRITE_EXCLUSIVE_REG_ONLY: return 0x05; case PR_EXCLUSIVE_ACCESS_REG_ONLY: return 0x06; case PR_WRITE_EXCLUSIVE_ALL_REGS: return 0x07; case PR_EXCLUSIVE_ACCESS_ALL_REGS: return 0x08; default: return 0; } }; static int sd_pr_command(struct block_device *bdev, u8 sa, u64 key, u64 sa_key, u8 type, u8 flags) { struct scsi_device *sdev = scsi_disk(bdev->bd_disk)->device; struct scsi_sense_hdr sshdr; int result; u8 cmd[16] = { 0, }; u8 data[24] = { 0, }; cmd[0] = PERSISTENT_RESERVE_OUT; cmd[1] = sa; cmd[2] = type; put_unaligned_be32(sizeof(data), &cmd[5]); put_unaligned_be64(key, &data[0]); put_unaligned_be64(sa_key, &data[8]); data[20] = flags; result = scsi_execute_req(sdev, cmd, DMA_TO_DEVICE, &data, sizeof(data), &sshdr, SD_TIMEOUT, SD_MAX_RETRIES, NULL); if ((driver_byte(result) & DRIVER_SENSE) && (scsi_sense_valid(&sshdr))) { sdev_printk(KERN_INFO, sdev, "PR command failed: %d\n", result); scsi_print_sense_hdr(sdev, NULL, &sshdr); } return result; } static int sd_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, u32 flags) { if (flags & ~PR_FL_IGNORE_KEY) return -EOPNOTSUPP; return sd_pr_command(bdev, (flags & PR_FL_IGNORE_KEY) ? 0x06 : 0x00, old_key, new_key, 0, (1 << 0) /* APTPL */); } static int sd_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, u32 flags) { if (flags) return -EOPNOTSUPP; return sd_pr_command(bdev, 0x01, key, 0, sd_pr_type(type), 0); } static int sd_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { return sd_pr_command(bdev, 0x02, key, 0, sd_pr_type(type), 0); } static int sd_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, enum pr_type type, bool abort) { return sd_pr_command(bdev, abort ? 0x05 : 0x04, old_key, new_key, sd_pr_type(type), 0); } static int sd_pr_clear(struct block_device *bdev, u64 key) { return sd_pr_command(bdev, 0x03, key, 0, 0, 0); } static const struct pr_ops sd_pr_ops = { .pr_register = sd_pr_register, .pr_reserve = sd_pr_reserve, .pr_release = sd_pr_release, .pr_preempt = sd_pr_preempt, .pr_clear = sd_pr_clear, }; static const struct block_device_operations sd_fops = { .owner = THIS_MODULE, .open = sd_open, .release = sd_release, .ioctl = sd_ioctl, .getgeo = sd_getgeo, #ifdef CONFIG_COMPAT .compat_ioctl = sd_compat_ioctl, #endif .check_events = sd_check_events, .revalidate_disk = sd_revalidate_disk, .unlock_native_capacity = sd_unlock_native_capacity, .pr_ops = &sd_pr_ops, }; /** * sd_eh_reset - reset error handling callback * @scmd: sd-issued command that has failed * * This function is called by the SCSI midlayer before starting * SCSI EH. When counting medium access failures we have to be * careful to register it only only once per device and SCSI EH run; * there might be several timed out commands which will cause the * 'max_medium_access_timeouts' counter to trigger after the first * SCSI EH run already and set the device to offline. * So this function resets the internal counter before starting SCSI EH. **/ static void sd_eh_reset(struct scsi_cmnd *scmd) { struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk); /* New SCSI EH run, reset gate variable */ sdkp->ignore_medium_access_errors = false; } /** * sd_eh_action - error handling callback * @scmd: sd-issued command that has failed * @eh_disp: The recovery disposition suggested by the midlayer * * This function is called by the SCSI midlayer upon completion of an * error test command (currently TEST UNIT READY). The result of sending * the eh command is passed in eh_disp. We're looking for devices that * fail medium access commands but are OK with non access commands like * test unit ready (so wrongly see the device as having a successful * recovery) **/ static int sd_eh_action(struct scsi_cmnd *scmd, int eh_disp) { struct scsi_disk *sdkp = scsi_disk(scmd->request->rq_disk); struct scsi_device *sdev = scmd->device; if (!scsi_device_online(sdev) || !scsi_medium_access_command(scmd) || host_byte(scmd->result) != DID_TIME_OUT || eh_disp != SUCCESS) return eh_disp; /* * The device has timed out executing a medium access command. * However, the TEST UNIT READY command sent during error * handling completed successfully. Either the device is in the * process of recovering or has it suffered an internal failure * that prevents access to the storage medium. */ if (!sdkp->ignore_medium_access_errors) { sdkp->medium_access_timed_out++; sdkp->ignore_medium_access_errors = true; } /* * If the device keeps failing read/write commands but TEST UNIT * READY always completes successfully we assume that medium * access is no longer possible and take the device offline. */ if (sdkp->medium_access_timed_out >= sdkp->max_medium_access_timeouts) { scmd_printk(KERN_ERR, scmd, "Medium access timeout failure. Offlining disk!\n"); mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_OFFLINE); mutex_unlock(&sdev->state_mutex); return SUCCESS; } return eh_disp; } static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd) { struct request *req = scmd->request; struct scsi_device *sdev = scmd->device; unsigned int transferred, good_bytes; u64 start_lba, end_lba, bad_lba; /* * Some commands have a payload smaller than the device logical * block size (e.g. INQUIRY on a 4K disk). */ if (scsi_bufflen(scmd) <= sdev->sector_size) return 0; /* Check if we have a 'bad_lba' information */ if (!scsi_get_sense_info_fld(scmd->sense_buffer, SCSI_SENSE_BUFFERSIZE, &bad_lba)) return 0; /* * If the bad lba was reported incorrectly, we have no idea where * the error is. */ start_lba = sectors_to_logical(sdev, blk_rq_pos(req)); end_lba = start_lba + bytes_to_logical(sdev, scsi_bufflen(scmd)); if (bad_lba < start_lba || bad_lba >= end_lba) return 0; /* * resid is optional but mostly filled in. When it's unused, * its value is zero, so we assume the whole buffer transferred */ transferred = scsi_bufflen(scmd) - scsi_get_resid(scmd); /* This computation should always be done in terms of the * resolution of the device's medium. */ good_bytes = logical_to_bytes(sdev, bad_lba - start_lba); return min(good_bytes, transferred); } /** * sd_done - bottom half handler: called when the lower level * driver has completed (successfully or otherwise) a scsi command. * @SCpnt: mid-level's per command structure. * * Note: potentially run from within an ISR. Must not block. **/ static int sd_done(struct scsi_cmnd *SCpnt) { int result = SCpnt->result; unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt); unsigned int sector_size = SCpnt->device->sector_size; unsigned int resid; struct scsi_sense_hdr sshdr; struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk); struct request *req = SCpnt->request; int sense_valid = 0; int sense_deferred = 0; switch (req_op(req)) { case REQ_OP_DISCARD: case REQ_OP_WRITE_ZEROES: case REQ_OP_WRITE_SAME: case REQ_OP_ZONE_RESET: if (!result) { good_bytes = blk_rq_bytes(req); scsi_set_resid(SCpnt, 0); } else { good_bytes = 0; scsi_set_resid(SCpnt, blk_rq_bytes(req)); } break; case REQ_OP_ZONE_REPORT: /* To avoid that the block layer performs an incorrect * bio_advance() call and restart of the remainder of * incomplete report zone BIOs, always indicate a full * completion of REQ_OP_ZONE_REPORT. */ if (!result) { good_bytes = scsi_bufflen(SCpnt); scsi_set_resid(SCpnt, 0); } else { good_bytes = 0; scsi_set_resid(SCpnt, blk_rq_bytes(req)); } break; default: /* * In case of bogus fw or device, we could end up having * an unaligned partial completion. Check this here and force * alignment. */ resid = scsi_get_resid(SCpnt); if (resid & (sector_size - 1)) { sd_printk(KERN_INFO, sdkp, "Unaligned partial completion (resid=%u, sector_sz=%u)\n", resid, sector_size); resid = min(scsi_bufflen(SCpnt), round_up(resid, sector_size)); scsi_set_resid(SCpnt, resid); } } if (result) { sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); if (sense_valid) sense_deferred = scsi_sense_is_deferred(&sshdr); } sdkp->medium_access_timed_out = 0; if (driver_byte(result) != DRIVER_SENSE && (!sense_valid || sense_deferred)) goto out; switch (sshdr.sense_key) { case HARDWARE_ERROR: case MEDIUM_ERROR: good_bytes = sd_completed_bytes(SCpnt); break; case RECOVERED_ERROR: good_bytes = scsi_bufflen(SCpnt); break; case NO_SENSE: /* This indicates a false check condition, so ignore it. An * unknown amount of data was transferred so treat it as an * error. */ SCpnt->result = 0; memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); break; case ABORTED_COMMAND: if (sshdr.asc == 0x10) /* DIF: Target detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case ILLEGAL_REQUEST: switch (sshdr.asc) { case 0x10: /* DIX: Host detected corruption */ good_bytes = sd_completed_bytes(SCpnt); break; case 0x20: /* INVALID COMMAND OPCODE */ case 0x24: /* INVALID FIELD IN CDB */ switch (SCpnt->cmnd[0]) { case UNMAP: sd_config_discard(sdkp, SD_LBP_DISABLE); break; case WRITE_SAME_16: case WRITE_SAME: if (SCpnt->cmnd[1] & 8) { /* UNMAP */ sd_config_discard(sdkp, SD_LBP_DISABLE); } else { sdkp->device->no_write_same = 1; sd_config_write_same(sdkp); req->__data_len = blk_rq_bytes(req); req->rq_flags |= RQF_QUIET; } break; } } break; default: break; } out: if (sd_is_zoned(sdkp)) sd_zbc_complete(SCpnt, good_bytes, &sshdr); SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, "sd_done: completed %d of %d bytes\n", good_bytes, scsi_bufflen(SCpnt))); if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt)) sd_dif_complete(SCpnt, good_bytes); return good_bytes; } /* * spinup disk - called only in sd_revalidate_disk() */ static void sd_spinup_disk(struct scsi_disk *sdkp) { unsigned char cmd[10]; unsigned long spintime_expire = 0; int retries, spintime; unsigned int the_result; struct scsi_sense_hdr sshdr; int sense_valid = 0; spintime = 0; /* Spin up drives, as required. Only do this at boot time */ /* Spinup needs to be done for module loads too. */ do { retries = 0; do { cmd[0] = TEST_UNIT_READY; memset((void *) &cmd[1], 0, 9); the_result = scsi_execute_req(sdkp->device, cmd, DMA_NONE, NULL, 0, &sshdr, SD_TIMEOUT, SD_MAX_RETRIES, NULL); /* * If the drive has indicated to us that it * doesn't have any media in it, don't bother * with any more polling. */ if (media_not_present(sdkp, &sshdr)) return; if (the_result) sense_valid = scsi_sense_valid(&sshdr); retries++; } while (retries < 3 && (!scsi_status_is_good(the_result) || ((driver_byte(the_result) & DRIVER_SENSE) && sense_valid && sshdr.sense_key == UNIT_ATTENTION))); if ((driver_byte(the_result) & DRIVER_SENSE) == 0) { /* no sense, TUR either succeeded or failed * with a status error */ if(!spintime && !scsi_status_is_good(the_result)) { sd_print_result(sdkp, "Test Unit Ready failed", the_result); } break; } /* * The device does not want the automatic start to be issued. */ if (sdkp->device->no_start_on_add) break; if (sense_valid && sshdr.sense_key == NOT_READY) { if (sshdr.asc == 4 && sshdr.ascq == 3) break; /* manual intervention required */ if (sshdr.asc == 4 && sshdr.ascq == 0xb) break; /* standby */ if (sshdr.asc == 4 && sshdr.ascq == 0xc) break; /* unavailable */ if (sshdr.asc == 4 && sshdr.ascq == 0x1b) break; /* sanitize in progress */ /* * Issue command to spin up drive when not ready */ if (!spintime) { sd_printk(KERN_NOTICE, sdkp, "Spinning up disk..."); cmd[0] = START_STOP; cmd[1] = 1; /* Return immediately */ memset((void *) &cmd[2], 0, 8); cmd[4] = 1; /* Start spin cycle */ if (sdkp->device->start_stop_pwr_cond) cmd[4] |= 1 << 4; scsi_execute_req(sdkp->device, cmd, DMA_NONE, NULL, 0, &sshdr, SD_TIMEOUT, SD_MAX_RETRIES, NULL); spintime_expire = jiffies + 100 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); printk("."); /* * Wait for USB flash devices with slow firmware. * Yes, this sense key/ASC combination shouldn't * occur here. It's characteristic of these devices. */ } else if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x28) { if (!spintime) { spintime_expire = jiffies + 5 * HZ; spintime = 1; } /* Wait 1 second for next try */ msleep(1000); } else { /* we don't understand the sense code, so it's * probably pointless to loop */ if(!spintime) { sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n"); sd_print_sense_hdr(sdkp, &sshdr); } break; } } while (spintime && time_before_eq(jiffies, spintime_expire)); if (spintime) { if (scsi_status_is_good(the_result)) printk("ready\n"); else printk("not responding...\n"); } } /* * Determine whether disk supports Data Integrity Field. */ static int sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdp = sdkp->device; u8 type; int ret = 0; if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) { sdkp->protection_type = 0; return ret; } type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */ if (type > T10_PI_TYPE3_PROTECTION) ret = -ENODEV; else if (scsi_host_dif_capable(sdp->host, type)) ret = 1; if (sdkp->first_scan || type != sdkp->protection_type) switch (ret) { case -ENODEV: sd_printk(KERN_ERR, sdkp, "formatted with unsupported" \ " protection type %u. Disabling disk!\n", type); break; case 1: sd_printk(KERN_NOTICE, sdkp, "Enabling DIF Type %u protection\n", type); break; case 0: sd_printk(KERN_NOTICE, sdkp, "Disabling DIF Type %u protection\n", type); break; } sdkp->protection_type = type; return ret; } static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, struct scsi_sense_hdr *sshdr, int sense_valid, int the_result) { if (driver_byte(the_result) & DRIVER_SENSE) sd_print_sense_hdr(sdkp, sshdr); else sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n"); /* * Set dirty bit for removable devices if not ready - * sometimes drives will not report this properly. */ if (sdp->removable && sense_valid && sshdr->sense_key == NOT_READY) set_media_not_present(sdkp); /* * We used to set media_present to 0 here to indicate no media * in the drive, but some drives fail read capacity even with * media present, so we can't do that. */ sdkp->capacity = 0; /* unknown mapped to zero - as usual */ } #define RC16_LEN 32 #if RC16_LEN > SD_BUF_SIZE #error RC16_LEN must not be more than SD_BUF_SIZE #endif #define READ_CAPACITY_RETRIES_ON_RESET 10 /* * Ensure that we don't overflow sector_t when CONFIG_LBDAF is not set * and the reported logical block size is bigger than 512 bytes. Note * that last_sector is a u64 and therefore logical_to_sectors() is not * applicable. */ static bool sd_addressable_capacity(u64 lba, unsigned int sector_size) { u64 last_sector = (lba + 1ULL) << (ilog2(sector_size) - 9); if (sizeof(sector_t) == 4 && last_sector > U32_MAX) return false; return true; } static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; int sense_valid = 0; int the_result; int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET; unsigned int alignment; unsigned long long lba; unsigned sector_size; if (sdp->no_read_capacity_16) return -EINVAL; do { memset(cmd, 0, 16); cmd[0] = SERVICE_ACTION_IN_16; cmd[1] = SAI_READ_CAPACITY_16; cmd[13] = RC16_LEN; memset(buffer, 0, RC16_LEN); the_result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, buffer, RC16_LEN, &sshdr, SD_TIMEOUT, SD_MAX_RETRIES, NULL); if (media_not_present(sdkp, &sshdr)) return -ENODEV; if (the_result) { sense_valid = scsi_sense_valid(&sshdr); if (sense_valid && sshdr.sense_key == ILLEGAL_REQUEST && (sshdr.asc == 0x20 || sshdr.asc == 0x24) && sshdr.ascq == 0x00) /* Invalid Command Operation Code or * Invalid Field in CDB, just retry * silently with RC10 */ return -EINVAL; if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29 && sshdr.ascq == 0x00) /* Device reset might occur several times, * give it one more chance */ if (--reset_retries > 0) continue; } retries--; } while (the_result && retries); if (the_result) { sd_print_result(sdkp, "Read Capacity(16) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[8]); lba = get_unaligned_be64(&buffer[0]); if (sd_read_protection_type(sdkp, buffer) < 0) { sdkp->capacity = 0; return -ENODEV; } if (!sd_addressable_capacity(lba, sector_size)) { sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a " "kernel compiled with support for large block " "devices.\n"); sdkp->capacity = 0; return -EOVERFLOW; } /* Logical blocks per physical block exponent */ sdkp->physical_block_size = (1 << (buffer[13] & 0xf)) * sector_size; /* RC basis */ sdkp->rc_basis = (buffer[12] >> 4) & 0x3; /* Lowest aligned logical block */ alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size; blk_queue_alignment_offset(sdp->request_queue, alignment); if (alignment && sdkp->first_scan) sd_printk(KERN_NOTICE, sdkp, "physical block alignment offset: %u\n", alignment); if (buffer[14] & 0x80) { /* LBPME */ sdkp->lbpme = 1; if (buffer[14] & 0x40) /* LBPRZ */ sdkp->lbprz = 1; sd_config_discard(sdkp, SD_LBP_WS16); } sdkp->capacity = lba + 1; return sector_size; } static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp, unsigned char *buffer) { unsigned char cmd[16]; struct scsi_sense_hdr sshdr; int sense_valid = 0; int the_result; int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET; sector_t lba; unsigned sector_size; do { cmd[0] = READ_CAPACITY; memset(&cmd[1], 0, 9); memset(buffer, 0, 8); the_result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, buffer, 8, &sshdr, SD_TIMEOUT, SD_MAX_RETRIES, NULL); if (media_not_present(sdkp, &sshdr)) return -ENODEV; if (the_result) { sense_valid = scsi_sense_valid(&sshdr); if (sense_valid && sshdr.sense_key == UNIT_ATTENTION && sshdr.asc == 0x29 && sshdr.ascq == 0x00) /* Device reset might occur several times, * give it one more chance */ if (--reset_retries > 0) continue; } retries--; } while (the_result && retries); if (the_result) { sd_print_result(sdkp, "Read Capacity(10) failed", the_result); read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); return -EINVAL; } sector_size = get_unaligned_be32(&buffer[4]); lba = get_unaligned_be32(&buffer[0]); if (sdp->no_read_capacity_16 && (lba == 0xffffffff)) { /* Some buggy (usb cardreader) devices return an lba of 0xffffffff when the want to report a size of 0 (with which they really mean no media is present) */ sdkp->capacity = 0; sdkp->physical_block_size = sector_size; return sector_size; } if (!sd_addressable_capacity(lba, sector_size)) { sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a " "kernel compiled with support for large block " "devices.\n"); sdkp->capacity = 0; return -EOVERFLOW; } sdkp->capacity = lba + 1; sdkp->physical_block_size = sector_size; return sector_size; } static int sd_try_rc16_first(struct scsi_device *sdp) { if (sdp->host->max_cmd_len < 16) return 0; if (sdp->try_rc_10_first) return 0; if (sdp->scsi_level > SCSI_SPC_2) return 1; if (scsi_device_protection(sdp)) return 1; return 0; } /* * read disk capacity */ static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) { int sector_size; struct scsi_device *sdp = sdkp->device; if (sd_try_rc16_first(sdp)) { sector_size = read_capacity_16(sdkp, sdp, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size == -ENODEV) return; if (sector_size < 0) sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size < 0) return; } else { sector_size = read_capacity_10(sdkp, sdp, buffer); if (sector_size == -EOVERFLOW) goto got_data; if (sector_size < 0) return; if ((sizeof(sdkp->capacity) > 4) && (sdkp->capacity > 0xffffffffULL)) { int old_sector_size = sector_size; sd_printk(KERN_NOTICE, sdkp, "Very big device. " "Trying to use READ CAPACITY(16).\n"); sector_size = read_capacity_16(sdkp, sdp, buffer); if (sector_size < 0) { sd_printk(KERN_NOTICE, sdkp, "Using 0xffffffff as device size\n"); sdkp->capacity = 1 + (sector_t) 0xffffffff; sector_size = old_sector_size; goto got_data; } /* Remember that READ CAPACITY(16) succeeded */ sdp->try_rc_10_first = 0; } } /* Some devices are known to return the total number of blocks, * not the highest block number. Some devices have versions * which do this and others which do not. Some devices we might * suspect of doing this but we don't know for certain. * * If we know the reported capacity is wrong, decrement it. If * we can only guess, then assume the number of blocks is even * (usually true but not always) and err on the side of lowering * the capacity. */ if (sdp->fix_capacity || (sdp->guess_capacity && (sdkp->capacity & 0x01))) { sd_printk(KERN_INFO, sdkp, "Adjusting the sector count " "from its reported value: %llu\n", (unsigned long long) sdkp->capacity); --sdkp->capacity; } got_data: if (sector_size == 0) { sector_size = 512; sd_printk(KERN_NOTICE, sdkp, "Sector size 0 reported, " "assuming 512.\n"); } if (sector_size != 512 && sector_size != 1024 && sector_size != 2048 && sector_size != 4096) { sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n", sector_size); /* * The user might want to re-format the drive with * a supported sectorsize. Once this happens, it * would be relatively trivial to set the thing up. * For this reason, we leave the thing in the table. */ sdkp->capacity = 0; /* * set a bogus sector size so the normal read/write * logic in the block layer will eventually refuse any * request on this device without tripping over power * of two sector size assumptions */ sector_size = 512; } blk_queue_logical_block_size(sdp->request_queue, sector_size); blk_queue_physical_block_size(sdp->request_queue, sdkp->physical_block_size); sdkp->device->sector_size = sector_size; if (sdkp->capacity > 0xffffffff) sdp->use_16_for_rw = 1; } /* * Print disk capacity */ static void sd_print_capacity(struct scsi_disk *sdkp, sector_t old_capacity) { int sector_size = sdkp->device->sector_size; char cap_str_2[10], cap_str_10[10]; string_get_size(sdkp->capacity, sector_size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); string_get_size(sdkp->capacity, sector_size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); if (sdkp->first_scan || old_capacity != sdkp->capacity) { sd_printk(KERN_NOTICE, sdkp, "%llu %d-byte logical blocks: (%s/%s)\n", (unsigned long long)sdkp->capacity, sector_size, cap_str_10, cap_str_2); if (sdkp->physical_block_size != sector_size) sd_printk(KERN_NOTICE, sdkp, "%u-byte physical blocks\n", sdkp->physical_block_size); sd_zbc_print_zones(sdkp); } } /* called with buffer of length 512 */ static inline int sd_do_mode_sense(struct scsi_device *sdp, int dbd, int modepage, unsigned char *buffer, int len, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { return scsi_mode_sense(sdp, dbd, modepage, buffer, len, SD_TIMEOUT, SD_MAX_RETRIES, data, sshdr); } /* * read write protect setting, if possible - called only in sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_write_protect_flag(struct scsi_disk *sdkp, unsigned char *buffer) { int res; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; int old_wp = sdkp->write_prot; set_disk_ro(sdkp->disk, 0); if (sdp->skip_ms_page_3f) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming Write Enabled\n"); return; } if (sdp->use_192_bytes_for_3f) { res = sd_do_mode_sense(sdp, 0, 0x3F, buffer, 192, &data, NULL); } else { /* * First attempt: ask for all pages (0x3F), but only 4 bytes. * We have to start carefully: some devices hang if we ask * for more than is available. */ res = sd_do_mode_sense(sdp, 0, 0x3F, buffer, 4, &data, NULL); /* * Second attempt: ask for page 0 When only page 0 is * implemented, a request for page 3F may return Sense Key * 5: Illegal Request, Sense Code 24: Invalid field in * CDB. */ if (!scsi_status_is_good(res)) res = sd_do_mode_sense(sdp, 0, 0, buffer, 4, &data, NULL); /* * Third attempt: ask 255 bytes, as we did earlier. */ if (!scsi_status_is_good(res)) res = sd_do_mode_sense(sdp, 0, 0x3F, buffer, 255, &data, NULL); } if (!scsi_status_is_good(res)) { sd_first_printk(KERN_WARNING, sdkp, "Test WP failed, assume Write Enabled\n"); } else { sdkp->write_prot = ((data.device_specific & 0x80) != 0); set_disk_ro(sdkp->disk, sdkp->write_prot); if (sdkp->first_scan || old_wp != sdkp->write_prot) { sd_printk(KERN_NOTICE, sdkp, "Write Protect is %s\n", sdkp->write_prot ? "on" : "off"); sd_printk(KERN_DEBUG, sdkp, "Mode Sense: %4ph\n", buffer); } } } /* * sd_read_cache_type - called only from sd_revalidate_disk() * called with buffer of length SD_BUF_SIZE */ static void sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer) { int len = 0, res; struct scsi_device *sdp = sdkp->device; int dbd; int modepage; int first_len; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; int old_wce = sdkp->WCE; int old_rcd = sdkp->RCD; int old_dpofua = sdkp->DPOFUA; if (sdkp->cache_override) return; first_len = 4; if (sdp->skip_ms_page_8) { if (sdp->type == TYPE_RBC) goto defaults; else { if (sdp->skip_ms_page_3f) goto defaults; modepage = 0x3F; if (sdp->use_192_bytes_for_3f) first_len = 192; dbd = 0; } } else if (sdp->type == TYPE_RBC) { modepage = 6; dbd = 8; } else { modepage = 8; dbd = 0; } /* cautiously ask */ res = sd_do_mode_sense(sdp, dbd, modepage, buffer, first_len, &data, &sshdr); if (!scsi_status_is_good(res)) goto bad_sense; if (!data.header_length) { modepage = 6; first_len = 0; sd_first_printk(KERN_ERR, sdkp, "Missing header in MODE_SENSE response\n"); } /* that went OK, now ask for the proper length */ len = data.length; /* * We're only interested in the first three bytes, actually. * But the data cache page is defined for the first 20. */ if (len < 3) goto bad_sense; else if (len > SD_BUF_SIZE) { sd_first_printk(KERN_NOTICE, sdkp, "Truncating mode parameter " "data from %d to %d bytes\n", len, SD_BUF_SIZE); len = SD_BUF_SIZE; } if (modepage == 0x3F && sdp->use_192_bytes_for_3f) len = 192; /* Get the data */ if (len > first_len) res = sd_do_mode_sense(sdp, dbd, modepage, buffer, len, &data, &sshdr); if (scsi_status_is_good(res)) { int offset = data.header_length + data.block_descriptor_length; while (offset < len) { u8 page_code = buffer[offset] & 0x3F; u8 spf = buffer[offset] & 0x40; if (page_code == 8 || page_code == 6) { /* We're interested only in the first 3 bytes. */ if (len - offset <= 2) { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode parameter " "data\n"); goto defaults; } else { modepage = page_code; goto Page_found; } } else { /* Go to the next page */ if (spf && len - offset > 3) offset += 4 + (buffer[offset+2] << 8) + buffer[offset+3]; else if (!spf && len - offset > 1) offset += 2 + buffer[offset+1]; else { sd_first_printk(KERN_ERR, sdkp, "Incomplete mode " "parameter data\n"); goto defaults; } } } sd_first_printk(KERN_ERR, sdkp, "No Caching mode page found\n"); goto defaults; Page_found: if (modepage == 8) { sdkp->WCE = ((buffer[offset + 2] & 0x04) != 0); sdkp->RCD = ((buffer[offset + 2] & 0x01) != 0); } else { sdkp->WCE = ((buffer[offset + 2] & 0x01) == 0); sdkp->RCD = 0; } sdkp->DPOFUA = (data.device_specific & 0x10) != 0; if (sdp->broken_fua) { sd_first_printk(KERN_NOTICE, sdkp, "Disabling FUA\n"); sdkp->DPOFUA = 0; } else if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw && !sdkp->device->use_16_for_rw) { sd_first_printk(KERN_NOTICE, sdkp, "Uses READ/WRITE(6), disabling FUA\n"); sdkp->DPOFUA = 0; } /* No cache flush allowed for write protected devices */ if (sdkp->WCE && sdkp->write_prot) sdkp->WCE = 0; if (sdkp->first_scan || old_wce != sdkp->WCE || old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA) sd_printk(KERN_NOTICE, sdkp, "Write cache: %s, read cache: %s, %s\n", sdkp->WCE ? "enabled" : "disabled", sdkp->RCD ? "disabled" : "enabled", sdkp->DPOFUA ? "supports DPO and FUA" : "doesn't support DPO or FUA"); return; } bad_sense: if (scsi_sense_valid(&sshdr) && sshdr.sense_key == ILLEGAL_REQUEST && sshdr.asc == 0x24 && sshdr.ascq == 0x0) /* Invalid field in CDB */ sd_first_printk(KERN_NOTICE, sdkp, "Cache data unavailable\n"); else sd_first_printk(KERN_ERR, sdkp, "Asking for cache data failed\n"); defaults: if (sdp->wce_default_on) { sd_first_printk(KERN_NOTICE, sdkp, "Assuming drive cache: write back\n"); sdkp->WCE = 1; } else { sd_first_printk(KERN_ERR, sdkp, "Assuming drive cache: write through\n"); sdkp->WCE = 0; } sdkp->RCD = 0; sdkp->DPOFUA = 0; } /* * The ATO bit indicates whether the DIF application tag is available * for use by the operating system. */ static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) { int res, offset; struct scsi_device *sdp = sdkp->device; struct scsi_mode_data data; struct scsi_sense_hdr sshdr; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC) return; if (sdkp->protection_type == 0) return; res = scsi_mode_sense(sdp, 1, 0x0a, buffer, 36, SD_TIMEOUT, SD_MAX_RETRIES, &data, &sshdr); if (!scsi_status_is_good(res) || !data.header_length || data.length < 6) { sd_first_printk(KERN_WARNING, sdkp, "getting Control mode page failed, assume no ATO\n"); if (scsi_sense_valid(&sshdr)) sd_print_sense_hdr(sdkp, &sshdr); return; } offset = data.header_length + data.block_descriptor_length; if ((buffer[offset] & 0x3f) != 0x0a) { sd_first_printk(KERN_ERR, sdkp, "ATO Got wrong page\n"); return; } if ((buffer[offset + 5] & 0x80) == 0) return; sdkp->ATO = 1; return; } /** * sd_read_block_limits - Query disk device for preferred I/O sizes. * @sdkp: disk to query */ static void sd_read_block_limits(struct scsi_disk *sdkp) { unsigned int sector_sz = sdkp->device->sector_size; const int vpd_len = 64; unsigned char *buffer = kmalloc(vpd_len, GFP_KERNEL); if (!buffer || /* Block Limits VPD */ scsi_get_vpd_page(sdkp->device, 0xb0, buffer, vpd_len)) goto out; blk_queue_io_min(sdkp->disk->queue, get_unaligned_be16(&buffer[6]) * sector_sz); sdkp->max_xfer_blocks = get_unaligned_be32(&buffer[8]); sdkp->opt_xfer_blocks = get_unaligned_be32(&buffer[12]); if (buffer[3] == 0x3c) { unsigned int lba_count, desc_count; sdkp->max_ws_blocks = (u32)get_unaligned_be64(&buffer[36]); if (!sdkp->lbpme) goto out; lba_count = get_unaligned_be32(&buffer[20]); desc_count = get_unaligned_be32(&buffer[24]); if (lba_count && desc_count) sdkp->max_unmap_blocks = lba_count; sdkp->unmap_granularity = get_unaligned_be32(&buffer[28]); if (buffer[32] & 0x80) sdkp->unmap_alignment = get_unaligned_be32(&buffer[32]) & ~(1 << 31); if (!sdkp->lbpvpd) { /* LBP VPD page not provided */ if (sdkp->max_unmap_blocks) sd_config_discard(sdkp, SD_LBP_UNMAP); else sd_config_discard(sdkp, SD_LBP_WS16); } else { /* LBP VPD page tells us what to use */ if (sdkp->lbpu && sdkp->max_unmap_blocks) sd_config_discard(sdkp, SD_LBP_UNMAP); else if (sdkp->lbpws) sd_config_discard(sdkp, SD_LBP_WS16); else if (sdkp->lbpws10) sd_config_discard(sdkp, SD_LBP_WS10); else sd_config_discard(sdkp, SD_LBP_DISABLE); } } out: kfree(buffer); } /** * sd_read_block_characteristics - Query block dev. characteristics * @sdkp: disk to query */ static void sd_read_block_characteristics(struct scsi_disk *sdkp) { struct request_queue *q = sdkp->disk->queue; unsigned char *buffer; u16 rot; const int vpd_len = 64; buffer = kmalloc(vpd_len, GFP_KERNEL); if (!buffer || /* Block Device Characteristics VPD */ scsi_get_vpd_page(sdkp->device, 0xb1, buffer, vpd_len)) goto out; rot = get_unaligned_be16(&buffer[4]); if (rot == 1) { queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); } if (sdkp->device->type == TYPE_ZBC) { /* Host-managed */ q->limits.zoned = BLK_ZONED_HM; } else { sdkp->zoned = (buffer[8] >> 4) & 3; if (sdkp->zoned == 1) /* Host-aware */ q->limits.zoned = BLK_ZONED_HA; else /* * Treat drive-managed devices as * regular block devices. */ q->limits.zoned = BLK_ZONED_NONE; } if (blk_queue_is_zoned(q) && sdkp->first_scan) sd_printk(KERN_NOTICE, sdkp, "Host-%s zoned block device\n", q->limits.zoned == BLK_ZONED_HM ? "managed" : "aware"); out: kfree(buffer); } /** * sd_read_block_provisioning - Query provisioning VPD page * @sdkp: disk to query */ static void sd_read_block_provisioning(struct scsi_disk *sdkp) { unsigned char *buffer; const int vpd_len = 8; if (sdkp->lbpme == 0) return; buffer = kmalloc(vpd_len, GFP_KERNEL); if (!buffer || scsi_get_vpd_page(sdkp->device, 0xb2, buffer, vpd_len)) goto out; sdkp->lbpvpd = 1; sdkp->lbpu = (buffer[5] >> 7) & 1; /* UNMAP */ sdkp->lbpws = (buffer[5] >> 6) & 1; /* WRITE SAME(16) with UNMAP */ sdkp->lbpws10 = (buffer[5] >> 5) & 1; /* WRITE SAME(10) with UNMAP */ out: kfree(buffer); } static void sd_read_write_same(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (sdev->host->no_write_same) { sdev->no_write_same = 1; return; } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, INQUIRY) < 0) { /* too large values might cause issues with arcmsr */ int vpd_buf_len = 64; sdev->no_report_opcodes = 1; /* Disable WRITE SAME if REPORT SUPPORTED OPERATION * CODES is unsupported and the device has an ATA * Information VPD page (SAT). */ if (!scsi_get_vpd_page(sdev, 0x89, buffer, vpd_buf_len)) sdev->no_write_same = 1; } if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME_16) == 1) sdkp->ws16 = 1; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, WRITE_SAME) == 1) sdkp->ws10 = 1; } static void sd_read_security(struct scsi_disk *sdkp, unsigned char *buffer) { struct scsi_device *sdev = sdkp->device; if (!sdev->security_supported) return; if (scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_IN) == 1 && scsi_report_opcode(sdev, buffer, SD_BUF_SIZE, SECURITY_PROTOCOL_OUT) == 1) sdkp->security = 1; } /* * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, not a * multiple of the physical block size, or simply garbage. */ static bool sd_validate_opt_xfer_size(struct scsi_disk *sdkp, unsigned int dev_max) { struct scsi_device *sdp = sdkp->device; unsigned int opt_xfer_bytes = logical_to_bytes(sdp, sdkp->opt_xfer_blocks); if (sdkp->opt_xfer_blocks == 0) return false; if (sdkp->opt_xfer_blocks > dev_max) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks " \ "> dev_max (%u logical blocks)\n", sdkp->opt_xfer_blocks, dev_max); return false; } if (sdkp->opt_xfer_blocks > SD_DEF_XFER_BLOCKS) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u logical blocks " \ "> sd driver limit (%u logical blocks)\n", sdkp->opt_xfer_blocks, SD_DEF_XFER_BLOCKS); return false; } if (opt_xfer_bytes < PAGE_SIZE) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes < " \ "PAGE_SIZE (%u bytes)\n", opt_xfer_bytes, (unsigned int)PAGE_SIZE); return false; } if (opt_xfer_bytes & (sdkp->physical_block_size - 1)) { sd_first_printk(KERN_WARNING, sdkp, "Optimal transfer size %u bytes not a " \ "multiple of physical block size (%u bytes)\n", opt_xfer_bytes, sdkp->physical_block_size); return false; } sd_first_printk(KERN_INFO, sdkp, "Optimal transfer size %u bytes\n", opt_xfer_bytes); return true; } /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. * @disk: struct gendisk we care about **/ static int sd_revalidate_disk(struct gendisk *disk) { struct scsi_disk *sdkp = scsi_disk(disk); struct scsi_device *sdp = sdkp->device; struct request_queue *q = sdkp->disk->queue; sector_t old_capacity = sdkp->capacity; unsigned char *buffer; unsigned int dev_max, rw_max; SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_revalidate_disk\n")); /* * If the device is offline, don't try and read capacity or any * of the other niceties. */ if (!scsi_device_online(sdp)) goto out; buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL); if (!buffer) { sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory " "allocation failure.\n"); goto out; } sd_spinup_disk(sdkp); /* * Without media there is no reason to ask; moreover, some devices * react badly if we do. */ if (sdkp->media_present) { sd_read_capacity(sdkp, buffer); if (scsi_device_supports_vpd(sdp)) { sd_read_block_provisioning(sdkp); sd_read_block_limits(sdkp); sd_read_block_characteristics(sdkp); sd_zbc_read_zones(sdkp, buffer); } sd_print_capacity(sdkp, old_capacity); sd_read_write_protect_flag(sdkp, buffer); sd_read_cache_type(sdkp, buffer); sd_read_app_tag_own(sdkp, buffer); sd_read_write_same(sdkp, buffer); sd_read_security(sdkp, buffer); } /* * We now have all cache related info, determine how we deal * with flush requests. */ sd_set_flush_flag(sdkp); /* Initial block count limit based on CDB TRANSFER LENGTH field size. */ dev_max = sdp->use_16_for_rw ? SD_MAX_XFER_BLOCKS : SD_DEF_XFER_BLOCKS; /* Some devices report a maximum block count for READ/WRITE requests. */ dev_max = min_not_zero(dev_max, sdkp->max_xfer_blocks); q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); if (sd_validate_opt_xfer_size(sdkp, dev_max)) { q->limits.io_opt = logical_to_bytes(sdp, sdkp->opt_xfer_blocks); rw_max = logical_to_sectors(sdp, sdkp->opt_xfer_blocks); } else { q->limits.io_opt = 0; rw_max = min_not_zero(logical_to_sectors(sdp, dev_max), (sector_t)BLK_DEF_MAX_SECTORS); } /* Do not exceed controller limit */ rw_max = min(rw_max, queue_max_hw_sectors(q)); /* * Only update max_sectors if previously unset or if the current value * exceeds the capabilities of the hardware. */ if (sdkp->first_scan || q->limits.max_sectors > q->limits.max_dev_sectors || q->limits.max_sectors > q->limits.max_hw_sectors) q->limits.max_sectors = rw_max; sdkp->first_scan = 0; set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp); kfree(buffer); out: return 0; } /** * sd_unlock_native_capacity - unlock native capacity * @disk: struct gendisk to set capacity for * * Block layer calls this function if it detects that partitions * on @disk reach beyond the end of the device. If the SCSI host * implements ->unlock_native_capacity() method, it's invoked to * give it a chance to adjust the device capacity. * * CONTEXT: * Defined by block layer. Might sleep. */ static void sd_unlock_native_capacity(struct gendisk *disk) { struct scsi_device *sdev = scsi_disk(disk)->device; if (sdev->host->hostt->unlock_native_capacity) sdev->host->hostt->unlock_native_capacity(sdev); } /** * sd_format_disk_name - format disk name * @prefix: name prefix - ie. "sd" for SCSI disks * @index: index of the disk to format name for * @buf: output buffer * @buflen: length of the output buffer * * SCSI disk names starts at sda. The 26th device is sdz and the * 27th is sdaa. The last one for two lettered suffix is sdzz * which is followed by sdaaa. * * This is basically 26 base counting with one extra 'nil' entry * at the beginning from the second digit on and can be * determined using similar method as 26 base conversion with the * index shifted -1 after each digit is computed. * * CONTEXT: * Don't care. * * RETURNS: * 0 on success, -errno on failure. */ static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen) { const int base = 'z' - 'a' + 1; char *begin = buf + strlen(prefix); char *end = buf + buflen; char *p; int unit; p = end - 1; *p = '\0'; unit = base; do { if (p == begin) return -EINVAL; *--p = 'a' + (index % unit); index = (index / unit) - 1; } while (index >= 0); memmove(begin, p, end - p); memcpy(buf, prefix, strlen(prefix)); return 0; } /* * The asynchronous part of sd_probe */ static void sd_probe_async(void *data, async_cookie_t cookie) { struct scsi_disk *sdkp = data; struct scsi_device *sdp; struct gendisk *gd; u32 index; struct device *dev; sdp = sdkp->device; gd = sdkp->disk; index = sdkp->index; dev = &sdp->sdev_gendev; gd->major = sd_major((index & 0xf0) >> 4); gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00); gd->fops = &sd_fops; gd->private_data = &sdkp->driver; gd->queue = sdkp->device->request_queue; /* defaults, until the device tells us otherwise */ sdp->sector_size = 512; sdkp->capacity = 0; sdkp->media_present = 1; sdkp->write_prot = 0; sdkp->cache_override = 0; sdkp->WCE = 0; sdkp->RCD = 0; sdkp->ATO = 0; sdkp->first_scan = 1; sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS; sd_revalidate_disk(gd); gd->flags = GENHD_FL_EXT_DEVT; if (sdp->removable) { gd->flags |= GENHD_FL_REMOVABLE; gd->events |= DISK_EVENT_MEDIA_CHANGE; } blk_pm_runtime_init(sdp->request_queue, dev); device_add_disk(dev, gd); if (sdkp->capacity) sd_dif_config_host(sdkp); sd_revalidate_disk(gd); if (sdkp->security) { sdkp->opal_dev = init_opal_dev(sdp, &sd_sec_submit); if (sdkp->opal_dev) sd_printk(KERN_NOTICE, sdkp, "supports TCG Opal\n"); } sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n", sdp->removable ? "removable " : ""); scsi_autopm_put_device(sdp); put_device(&sdkp->dev); } /** * sd_probe - called during driver initialization and whenever a * new scsi device is attached to the system. It is called once * for each scsi device (not just disks) present. * @dev: pointer to device object * * Returns 0 if successful (or not interested in this scsi device * (e.g. scanner)); 1 when there is an error. * * Note: this function is invoked from the scsi mid-level. * This function sets up the mapping between a given * <host,channel,id,lun> (found in sdp) and new device name * (e.g. /dev/sda). More precisely it is the block device major * and minor number that is chosen here. * * Assume sd_probe is not re-entrant (for time being) * Also think about sd_probe() and sd_remove() running coincidentally. **/ static int sd_probe(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); struct scsi_disk *sdkp; struct gendisk *gd; int index; int error; scsi_autopm_get_device(sdp); error = -ENODEV; if (sdp->type != TYPE_DISK && sdp->type != TYPE_ZBC && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC) goto out; #ifndef CONFIG_BLK_DEV_ZONED if (sdp->type == TYPE_ZBC) goto out; #endif SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp, "sd_probe\n")); error = -ENOMEM; sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL); if (!sdkp) goto out; gd = alloc_disk(SD_MINORS); if (!gd) goto out_free; do { if (!ida_pre_get(&sd_index_ida, GFP_KERNEL)) goto out_put; spin_lock(&sd_index_lock); error = ida_get_new(&sd_index_ida, &index); spin_unlock(&sd_index_lock); } while (error == -EAGAIN); if (error) { sdev_printk(KERN_WARNING, sdp, "sd_probe: memory exhausted.\n"); goto out_put; } error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN); if (error) { sdev_printk(KERN_WARNING, sdp, "SCSI disk (sd) name length exceeded.\n"); goto out_free_index; } sdkp->device = sdp; sdkp->driver = &sd_template; sdkp->disk = gd; sdkp->index = index; atomic_set(&sdkp->openers, 0); atomic_set(&sdkp->device->ioerr_cnt, 0); if (!sdp->request_queue->rq_timeout) { if (sdp->type != TYPE_MOD) blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT); else blk_queue_rq_timeout(sdp->request_queue, SD_MOD_TIMEOUT); } device_initialize(&sdkp->dev); sdkp->dev.parent = get_device(dev); sdkp->dev.class = &sd_disk_class; dev_set_name(&sdkp->dev, "%s", dev_name(dev)); error = device_add(&sdkp->dev); if (error) { put_device(&sdkp->dev); goto out; } dev_set_drvdata(dev, sdkp); get_device(&sdkp->dev); /* prevent release before async_schedule */ async_schedule_domain(sd_probe_async, sdkp, &scsi_sd_probe_domain); return 0; out_free_index: spin_lock(&sd_index_lock); ida_remove(&sd_index_ida, index); spin_unlock(&sd_index_lock); out_put: put_disk(gd); out_free: kfree(sdkp); out: scsi_autopm_put_device(sdp); return error; } /** * sd_remove - called whenever a scsi disk (previously recognized by * sd_probe) is detached from the system. It is called (potentially * multiple times) during sd module unload. * @dev: pointer to device object * * Note: this function is invoked from the scsi mid-level. * This function potentially frees up a device name (e.g. /dev/sdc) * that could be re-used by a subsequent sd_probe(). * This function is not called when the built-in sd driver is "exit-ed". **/ static int sd_remove(struct device *dev) { struct scsi_disk *sdkp; dev_t devt; sdkp = dev_get_drvdata(dev); devt = disk_devt(sdkp->disk); scsi_autopm_get_device(sdkp->device); async_synchronize_full_domain(&scsi_sd_pm_domain); async_synchronize_full_domain(&scsi_sd_probe_domain); device_del(&sdkp->dev); del_gendisk(sdkp->disk); sd_shutdown(dev); sd_zbc_remove(sdkp); free_opal_dev(sdkp->opal_dev); blk_register_region(devt, SD_MINORS, NULL, sd_default_probe, NULL, NULL); mutex_lock(&sd_ref_mutex); dev_set_drvdata(dev, NULL); put_device(&sdkp->dev); mutex_unlock(&sd_ref_mutex); return 0; } /** * scsi_disk_release - Called to free the scsi_disk structure * @dev: pointer to embedded class device * * sd_ref_mutex must be held entering this routine. Because it is * called on last put, you should always use the scsi_disk_get() * scsi_disk_put() helpers which manipulate the semaphore directly * and never do a direct put_device. **/ static void scsi_disk_release(struct device *dev) { struct scsi_disk *sdkp = to_scsi_disk(dev); struct gendisk *disk = sdkp->disk; struct request_queue *q = disk->queue; spin_lock(&sd_index_lock); ida_remove(&sd_index_ida, sdkp->index); spin_unlock(&sd_index_lock); /* * Wait until all requests that are in progress have completed. * This is necessary to avoid that e.g. scsi_end_request() crashes * due to clearing the disk->private_data pointer. Wait from inside * scsi_disk_release() instead of from sd_release() to avoid that * freezing and unfreezing the request queue affects user space I/O * in case multiple processes open a /dev/sd... node concurrently. */ blk_mq_freeze_queue(q); blk_mq_unfreeze_queue(q); disk->private_data = NULL; put_disk(disk); put_device(&sdkp->device->sdev_gendev); kfree(sdkp); } static int sd_start_stop_device(struct scsi_disk *sdkp, int start) { unsigned char cmd[6] = { START_STOP }; /* START_VALID */ struct scsi_sense_hdr sshdr; struct scsi_device *sdp = sdkp->device; int res; if (start) cmd[4] |= 1; /* START */ if (sdp->start_stop_pwr_cond) cmd[4] |= start ? 1 << 4 : 3 << 4; /* Active or Standby */ if (!scsi_device_online(sdp)) return -ENODEV; res = scsi_execute(sdp, cmd, DMA_NONE, NULL, 0, NULL, &sshdr, SD_TIMEOUT, SD_MAX_RETRIES, 0, RQF_PM, NULL); if (res) { sd_print_result(sdkp, "Start/Stop Unit failed", res); if (driver_byte(res) & DRIVER_SENSE) sd_print_sense_hdr(sdkp, &sshdr); if (scsi_sense_valid(&sshdr) && /* 0x3a is medium not present */ sshdr.asc == 0x3a) res = 0; } /* SCSI error codes must not go to the generic layer */ if (res) return -EIO; return 0; } /* * Send a SYNCHRONIZE CACHE instruction down to the device through * the normal SCSI command structure. Wait for the command to * complete. */ static void sd_shutdown(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); if (!sdkp) return; /* this can happen */ if (pm_runtime_suspended(dev)) return; if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); sd_sync_cache(sdkp, NULL); } if (system_state != SYSTEM_RESTART && sdkp->device->manage_start_stop) { sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); sd_start_stop_device(sdkp, 0); } } static int sd_suspend_common(struct device *dev, bool ignore_stop_errors) { struct scsi_disk *sdkp = dev_get_drvdata(dev); struct scsi_sense_hdr sshdr; int ret = 0; if (!sdkp) /* E.g.: runtime suspend following sd_remove() */ return 0; if (sdkp->WCE && sdkp->media_present) { sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); ret = sd_sync_cache(sdkp, &sshdr); if (ret) { /* ignore OFFLINE device */ if (ret == -ENODEV) return 0; if (!scsi_sense_valid(&sshdr) || sshdr.sense_key != ILLEGAL_REQUEST) return ret; /* * sshdr.sense_key == ILLEGAL_REQUEST means this drive * doesn't support sync. There's not much to do and * suspend shouldn't fail. */ ret = 0; } } if (sdkp->device->manage_start_stop) { sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); /* an error is not worth aborting a system sleep */ ret = sd_start_stop_device(sdkp, 0); if (ignore_stop_errors) ret = 0; } return ret; } static int sd_suspend_system(struct device *dev) { return sd_suspend_common(dev, true); } static int sd_suspend_runtime(struct device *dev) { return sd_suspend_common(dev, false); } static int sd_resume(struct device *dev) { struct scsi_disk *sdkp = dev_get_drvdata(dev); int ret; if (!sdkp) /* E.g.: runtime resume at the start of sd_probe() */ return 0; if (!sdkp->device->manage_start_stop) return 0; sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); ret = sd_start_stop_device(sdkp, 1); if (!ret) opal_unlock_from_suspend(sdkp->opal_dev); return ret; } /** * init_sd - entry point for this driver (both when built in or when * a module). * * Note: this function registers this driver with the scsi mid-level. **/ static int __init init_sd(void) { int majors = 0, i, err; SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n")); for (i = 0; i < SD_MAJORS; i++) { if (register_blkdev(sd_major(i), "sd") != 0) continue; majors++; blk_register_region(sd_major(i), SD_MINORS, NULL, sd_default_probe, NULL, NULL); } if (!majors) return -ENODEV; err = class_register(&sd_disk_class); if (err) goto err_out; sd_cdb_cache = kmem_cache_create("sd_ext_cdb", SD_EXT_CDB_SIZE, 0, 0, NULL); if (!sd_cdb_cache) { printk(KERN_ERR "sd: can't init extended cdb cache\n"); err = -ENOMEM; goto err_out_class; } sd_cdb_pool = mempool_create_slab_pool(SD_MEMPOOL_SIZE, sd_cdb_cache); if (!sd_cdb_pool) { printk(KERN_ERR "sd: can't init extended cdb pool\n"); err = -ENOMEM; goto err_out_cache; } sd_page_pool = mempool_create_page_pool(SD_MEMPOOL_SIZE, 0); if (!sd_page_pool) { printk(KERN_ERR "sd: can't init discard page pool\n"); err = -ENOMEM; goto err_out_ppool; } err = scsi_register_driver(&sd_template.gendrv); if (err) goto err_out_driver; return 0; err_out_driver: mempool_destroy(sd_page_pool); err_out_ppool: mempool_destroy(sd_cdb_pool); err_out_cache: kmem_cache_destroy(sd_cdb_cache); err_out_class: class_unregister(&sd_disk_class); err_out: for (i = 0; i < SD_MAJORS; i++) unregister_blkdev(sd_major(i), "sd"); return err; } /** * exit_sd - exit point for this driver (when it is a module). * * Note: this function unregisters this driver from the scsi mid-level. **/ static void __exit exit_sd(void) { int i; SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n")); scsi_unregister_driver(&sd_template.gendrv); mempool_destroy(sd_cdb_pool); mempool_destroy(sd_page_pool); kmem_cache_destroy(sd_cdb_cache); class_unregister(&sd_disk_class); for (i = 0; i < SD_MAJORS; i++) { blk_unregister_region(sd_major(i), SD_MINORS); unregister_blkdev(sd_major(i), "sd"); } } module_init(init_sd); module_exit(exit_sd); static void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr) { scsi_print_sense_hdr(sdkp->device, sdkp->disk ? sdkp->disk->disk_name : NULL, sshdr); } static void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result) { const char *hb_string = scsi_hostbyte_string(result); const char *db_string = scsi_driverbyte_string(result); if (hb_string || db_string) sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=%s driverbyte=%s\n", msg, hb_string ? hb_string : "invalid", db_string ? db_string : "invalid"); else sd_printk(KERN_INFO, sdkp, "%s: Result: hostbyte=0x%02x driverbyte=0x%02x\n", msg, host_byte(result), driver_byte(result)); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 /* Copyright (C) 2003-2013 Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ /* Kernel module implementing an IP set type: the hash:ip,port,ip type */ #include <linux/jhash.h> #include <linux/module.h> #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/errno.h> #include <linux/random.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/tcp.h> #include <linux/netfilter.h> #include <linux/netfilter/ipset/pfxlen.h> #include <linux/netfilter/ipset/ip_set.h> #include <linux/netfilter/ipset/ip_set_getport.h> #include <linux/netfilter/ipset/ip_set_hash.h> #define IPSET_TYPE_REV_MIN 0 /* 1 SCTP and UDPLITE support added */ /* 2 Counters support added */ /* 3 Comments support added */ /* 4 Forceadd support added */ #define IPSET_TYPE_REV_MAX 5 /* skbinfo support added */ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>"); IP_SET_MODULE_DESC("hash:ip,port,ip", IPSET_TYPE_REV_MIN, IPSET_TYPE_REV_MAX); MODULE_ALIAS("ip_set_hash:ip,port,ip"); /* Type specific function prefix */ #define HTYPE hash_ipportip /* IPv4 variant */ /* Member elements */ struct hash_ipportip4_elem { __be32 ip; __be32 ip2; __be16 port; u8 proto; u8 padding; }; static inline bool hash_ipportip4_data_equal(const struct hash_ipportip4_elem *ip1, const struct hash_ipportip4_elem *ip2, u32 *multi) { return ip1->ip == ip2->ip && ip1->ip2 == ip2->ip2 && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip4_data_list(struct sk_buff *skb, const struct hash_ipportip4_elem *data) { if (nla_put_ipaddr4(skb, IPSET_ATTR_IP, data->ip) || nla_put_ipaddr4(skb, IPSET_ATTR_IP2, data->ip2) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static inline void hash_ipportip4_data_next(struct hash_ipportip4_elem *next, const struct hash_ipportip4_elem *d) { next->ip = d->ip; next->port = d->port; } /* Common functions */ #define MTYPE hash_ipportip4 #define HOST_MASK 32 #include "ip_set_hash_gen.h" static int hash_ipportip4_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip4_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip4addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip); ip4addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportip4 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip4_elem e = { .ip = 0 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip, ip_to = 0, p = 0, port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr4(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMP)) e.port = 0; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR] || tb[IPSET_ATTR_PORT_TO])) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } ip_to = ip = ntohl(e.ip); if (tb[IPSET_ATTR_IP_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; if (ip > ip_to) swap(ip, ip_to); } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (!cidr || cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; ip_set_mask_from_to(ip, ip_to, cidr); } port_to = port = ntohs(e.port); if (with_ports && tb[IPSET_ATTR_PORT_TO]) { port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); } if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { p = retried && ip == ntohl(h->next.ip) ? ntohs(h->next.port) : port; for (; p <= port_to; p++) { e.ip = htonl(ip); e.port = htons(p); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } } return ret; } /* IPv6 variant */ struct hash_ipportip6_elem { union nf_inet_addr ip; union nf_inet_addr ip2; __be16 port; u8 proto; u8 padding; }; /* Common functions */ static inline bool hash_ipportip6_data_equal(const struct hash_ipportip6_elem *ip1, const struct hash_ipportip6_elem *ip2, u32 *multi) { return ipv6_addr_equal(&ip1->ip.in6, &ip2->ip.in6) && ipv6_addr_equal(&ip1->ip2.in6, &ip2->ip2.in6) && ip1->port == ip2->port && ip1->proto == ip2->proto; } static bool hash_ipportip6_data_list(struct sk_buff *skb, const struct hash_ipportip6_elem *data) { if (nla_put_ipaddr6(skb, IPSET_ATTR_IP, &data->ip.in6) || nla_put_ipaddr6(skb, IPSET_ATTR_IP2, &data->ip2.in6) || nla_put_net16(skb, IPSET_ATTR_PORT, data->port) || nla_put_u8(skb, IPSET_ATTR_PROTO, data->proto)) goto nla_put_failure; return false; nla_put_failure: return true; } static inline void hash_ipportip6_data_next(struct hash_ipportip6_elem *next, const struct hash_ipportip6_elem *d) { next->port = d->port; } #undef MTYPE #undef HOST_MASK #define MTYPE hash_ipportip6 #define HOST_MASK 128 #define IP_SET_EMIT_CREATE #include "ip_set_hash_gen.h" static int hash_ipportip6_kadt(struct ip_set *set, const struct sk_buff *skb, const struct xt_action_param *par, enum ipset_adt adt, struct ip_set_adt_opt *opt) { ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set); if (!ip_set_get_ip6_port(skb, opt->flags & IPSET_DIM_TWO_SRC, &e.port, &e.proto)) return -EINVAL; ip6addrptr(skb, opt->flags & IPSET_DIM_ONE_SRC, &e.ip.in6); ip6addrptr(skb, opt->flags & IPSET_DIM_THREE_SRC, &e.ip2.in6); return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags); } static int hash_ipportip6_uadt(struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 *lineno, u32 flags, bool retried) { const struct hash_ipportip6 *h = set->data; ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_ipportip6_elem e = { .ip = { .all = { 0 } } }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 port, port_to; bool with_ports = false; int ret; if (tb[IPSET_ATTR_LINENO]) *lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]); if (unlikely(!tb[IPSET_ATTR_IP] || !tb[IPSET_ATTR_IP2] || !ip_set_attr_netorder(tb, IPSET_ATTR_PORT) || !ip_set_optattr_netorder(tb, IPSET_ATTR_PORT_TO))) return -IPSET_ERR_PROTOCOL; if (unlikely(tb[IPSET_ATTR_IP_TO])) return -IPSET_ERR_HASH_RANGE_UNSUPPORTED; if (unlikely(tb[IPSET_ATTR_CIDR])) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); if (cidr != HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP], &e.ip); if (ret) return ret; ret = ip_set_get_extensions(set, tb, &ext); if (ret) return ret; ret = ip_set_get_ipaddr6(tb[IPSET_ATTR_IP2], &e.ip2); if (ret) return ret; e.port = nla_get_be16(tb[IPSET_ATTR_PORT]); if (tb[IPSET_ATTR_PROTO]) { e.proto = nla_get_u8(tb[IPSET_ATTR_PROTO]); with_ports = ip_set_proto_with_ports(e.proto); if (e.proto == 0) return -IPSET_ERR_INVALID_PROTO; } else { return -IPSET_ERR_MISSING_PROTO; } if (!(with_ports || e.proto == IPPROTO_ICMPV6)) e.port = 0; if (adt == IPSET_TEST || !with_ports || !tb[IPSET_ATTR_PORT_TO]) { ret = adtfn(set, &e, &ext, &ext, flags); return ip_set_eexist(ret, flags) ? 0 : ret; } port = ntohs(e.port); port_to = ip_set_get_h16(tb[IPSET_ATTR_PORT_TO]); if (port > port_to) swap(port, port_to); if (retried) port = ntohs(h->next.port); for (; port <= port_to; port++) { e.port = htons(port); ret = adtfn(set, &e, &ext, &ext, flags); if (ret && !ip_set_eexist(ret, flags)) return ret; ret = 0; } return ret; } static struct ip_set_type hash_ipportip_type __read_mostly = { .name = "hash:ip,port,ip", .protocol = IPSET_PROTOCOL, .features = IPSET_TYPE_IP | IPSET_TYPE_PORT | IPSET_TYPE_IP2, .dimension = IPSET_DIM_THREE, .family = NFPROTO_UNSPEC, .revision_min = IPSET_TYPE_REV_MIN, .revision_max = IPSET_TYPE_REV_MAX, .create = hash_ipportip_create, .create_policy = { [IPSET_ATTR_HASHSIZE] = { .type = NLA_U32 }, [IPSET_ATTR_MAXELEM] = { .type = NLA_U32 }, [IPSET_ATTR_PROBES] = { .type = NLA_U8 }, [IPSET_ATTR_RESIZE] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_CADT_FLAGS] = { .type = NLA_U32 }, }, .adt_policy = { [IPSET_ATTR_IP] = { .type = NLA_NESTED }, [IPSET_ATTR_IP_TO] = { .type = NLA_NESTED }, [IPSET_ATTR_IP2] = { .type = NLA_NESTED }, [IPSET_ATTR_PORT] = { .type = NLA_U16 }, [IPSET_ATTR_PORT_TO] = { .type = NLA_U16 }, [IPSET_ATTR_CIDR] = { .type = NLA_U8 }, [IPSET_ATTR_PROTO] = { .type = NLA_U8 }, [IPSET_ATTR_TIMEOUT] = { .type = NLA_U32 }, [IPSET_ATTR_LINENO] = { .type = NLA_U32 }, [IPSET_ATTR_BYTES] = { .type = NLA_U64 }, [IPSET_ATTR_PACKETS] = { .type = NLA_U64 }, [IPSET_ATTR_COMMENT] = { .type = NLA_NUL_STRING, .len = IPSET_MAX_COMMENT_SIZE }, [IPSET_ATTR_SKBMARK] = { .type = NLA_U64 }, [IPSET_ATTR_SKBPRIO] = { .type = NLA_U32 }, [IPSET_ATTR_SKBQUEUE] = { .type = NLA_U16 }, }, .me = THIS_MODULE, }; static int __init hash_ipportip_init(void) { return ip_set_type_register(&hash_ipportip_type); } static void __exit hash_ipportip_fini(void) { rcu_barrier(); ip_set_type_unregister(&hash_ipportip_type); } module_init(hash_ipportip_init); module_exit(hash_ipportip_fini);
130 131 131 62 2 78 129 57 57 57 56 54 3 54 54 1 53 52 54 54 54 4 54 54 36 36 53 44 44 18 42 39 11 10 11 57 1 1 47 47 3 3 46 46 13 13 1 1 12 12 3 11 11 9 6 6 5 6 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 /* * linux/fs/hfs/dir.c * * Copyright (C) 1995-1997 Paul H. Hargrove * (C) 2003 Ardis Technologies <roman@ardistech.com> * This file may be distributed under the terms of the GNU General Public License. * * This file contains directory-related functions independent of which * scheme is being used to represent forks. * * Based on the minix file system code, (C) 1991, 1992 by Linus Torvalds */ #include "hfs_fs.h" #include "btree.h" /* * hfs_lookup() */ static struct dentry *hfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { hfs_cat_rec rec; struct hfs_find_data fd; struct inode *inode = NULL; int res; res = hfs_find_init(HFS_SB(dir->i_sb)->cat_tree, &fd); if (res) return ERR_PTR(res); hfs_cat_build_key(dir->i_sb, fd.search_key, dir->i_ino, &dentry->d_name); res = hfs_brec_read(&fd, &rec, sizeof(rec)); if (res) { hfs_find_exit(&fd); if (res == -ENOENT) { /* No such entry */ inode = NULL; goto done; } return ERR_PTR(res); } inode = hfs_iget(dir->i_sb, &fd.search_key->cat, &rec); hfs_find_exit(&fd); if (!inode) return ERR_PTR(-EACCES); done: d_add(dentry, inode); return NULL; } /* * hfs_readdir */ static int hfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; int len, err; char strbuf[HFS_MAX_NAMELEN]; union hfs_cat_rec entry; struct hfs_find_data fd; struct hfs_readdir_data *rd; u16 type; if (ctx->pos >= inode->i_size) return 0; err = hfs_find_init(HFS_SB(sb)->cat_tree, &fd); if (err) return err; hfs_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); err = hfs_brec_find(&fd); if (err) goto out; if (ctx->pos == 0) { /* This is completely artificial... */ if (!dir_emit_dot(file, ctx)) goto out; ctx->pos = 1; } if (ctx->pos == 1) { if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); if (entry.type != HFS_CDR_THD) { pr_err("bad catalog folder thread\n"); err = -EIO; goto out; } //if (fd.entrylength < HFS_MIN_THREAD_SZ) { // pr_err("truncated catalog thread\n"); // err = -EIO; // goto out; //} if (!dir_emit(ctx, "..", 2, be32_to_cpu(entry.thread.ParID), DT_DIR)) goto out; ctx->pos = 2; } if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, ctx->pos - 1); if (err) goto out; for (;;) { if (be32_to_cpu(fd.key->cat.ParID) != inode->i_ino) { pr_err("walked past end of dir\n"); err = -EIO; goto out; } if (fd.entrylength > sizeof(entry) || fd.entrylength < 0) { err = -EIO; goto out; } hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, fd.entrylength); type = entry.type; len = hfs_mac2asc(sb, strbuf, &fd.key->cat.CName); if (type == HFS_CDR_DIR) { if (fd.entrylength < sizeof(struct hfs_cat_dir)) { pr_err("small dir entry\n"); err = -EIO; goto out; } if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.dir.DirID), DT_DIR)) break; } else if (type == HFS_CDR_FIL) { if (fd.entrylength < sizeof(struct hfs_cat_file)) { pr_err("small file entry\n"); err = -EIO; goto out; } if (!dir_emit(ctx, strbuf, len, be32_to_cpu(entry.file.FlNum), DT_REG)) break; } else { pr_err("bad catalog entry type %d\n", type); err = -EIO; goto out; } ctx->pos++; if (ctx->pos >= inode->i_size) goto out; err = hfs_brec_goto(&fd, 1); if (err) goto out; } rd = file->private_data; if (!rd) { rd = kmalloc(sizeof(struct hfs_readdir_data), GFP_KERNEL); if (!rd) { err = -ENOMEM; goto out; } file->private_data = rd; rd->file = file; spin_lock(&HFS_I(inode)->open_dir_lock); list_add(&rd->list, &HFS_I(inode)->open_dir_list); spin_unlock(&HFS_I(inode)->open_dir_lock); } /* * Can be done after the list insertion; exclusion with * hfs_delete_cat() is provided by directory lock. */ memcpy(&rd->key, &fd.key->cat, sizeof(struct hfs_cat_key)); out: hfs_find_exit(&fd); return err; } static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { spin_lock(&HFS_I(inode)->open_dir_lock); list_del(&rd->list); spin_unlock(&HFS_I(inode)->open_dir_lock); kfree(rd); } return 0; } /* * hfs_create() * * This is the create() entry in the inode_operations structure for * regular HFS directories. The purpose is to create a new file in * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, mode); if (!inode) return -ENOMEM; res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; } d_instantiate(dentry, inode); mark_inode_dirty(inode); return 0; } /* * hfs_mkdir() * * This is the mkdir() entry in the inode_operations structure for * regular HFS directories. The purpose is to create a new directory * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); if (!inode) return -ENOMEM; res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); return res; } d_instantiate(dentry, inode); mark_inode_dirty(inode); return 0; } /* * hfs_remove() * * This serves as both unlink() and rmdir() in the inode_operations * structure for regular HFS directories. The purpose is to delete * an existing child, given the inode for the parent directory and * the name (and its length) of the existing directory. * * HFS does not have hardlinks, so both rmdir and unlink set the * link count to 0. The only difference is the emptiness check. */ static int hfs_remove(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int res; if (S_ISDIR(inode->i_mode) && inode->i_size != 2) return -ENOTEMPTY; res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name); if (res) return res; clear_nlink(inode); inode->i_ctime = current_time(inode); hfs_delete_inode(inode); mark_inode_dirty(inode); return 0; } /* * hfs_rename() * * This is the rename() entry in the inode_operations structure for * regular HFS directories. The purpose is to rename an existing * file or directory, given the inode for the current directory and * the name (and its length) of the existing file/directory and the * inode for the new directory and the name (and its length) of the * new file/directory. * XXX: how do you handle must_be dir? */ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int res; if (flags & ~RENAME_NOREPLACE) return -EINVAL; /* Unlink destination if it already exists */ if (d_really_is_positive(new_dentry)) { res = hfs_remove(new_dir, new_dentry); if (res) return res; } res = hfs_cat_move(d_inode(old_dentry)->i_ino, old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); if (!res) hfs_cat_build_key(old_dir->i_sb, (btree_key *)&HFS_I(d_inode(old_dentry))->cat_key, new_dir->i_ino, &new_dentry->d_name); return res; } const struct file_operations hfs_dir_operations = { .read = generic_read_dir, .iterate_shared = hfs_readdir, .llseek = generic_file_llseek, .release = hfs_dir_release, }; const struct inode_operations hfs_dir_inode_operations = { .create = hfs_create, .lookup = hfs_lookup, .unlink = hfs_remove, .mkdir = hfs_mkdir, .rmdir = hfs_remove, .rename = hfs_rename, .setattr = hfs_inode_setattr, };
299 338 299 26 273 39 26 9 3742 1778 1836 1826 1616 368 152 1778 1427 6 106 520 80 17 3 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ADDRCONF_H #define _ADDRCONF_H #define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define MIN_VALID_LIFETIME (2*3600) /* 2 hours */ #define TEMP_VALID_LIFETIME (7*86400) #define TEMP_PREFERRED_LIFETIME (86400) #define REGEN_MAX_RETRY (3) #define MAX_DESYNC_FACTOR (600) #define ADDR_CHECK_FREQUENCY (120*HZ) #define IPV6_MAX_ADDRESSES 16 #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ? HZ / 50 : 1) #define ADDRCONF_TIMER_FUZZ (HZ / 4) #define ADDRCONF_TIMER_FUZZ_MAX (HZ) #define ADDRCONF_NOTIFY_PRIORITY 0 #include <linux/in.h> #include <linux/in6.h> struct prefix_info { __u8 type; __u8 length; __u8 prefix_len; #if defined(__BIG_ENDIAN_BITFIELD) __u8 onlink : 1, autoconf : 1, reserved : 6; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved : 6, autoconf : 1, onlink : 1; #else #error "Please fix <asm/byteorder.h>" #endif __be32 valid; __be32 prefered; __be32 reserved2; struct in6_addr prefix; }; #include <linux/netdevice.h> #include <net/if_inet6.h> #include <net/ipv6.h> struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; }; #define IN6_ADDR_HSIZE_SHIFT 4 #define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) int addrconf_init(void); void addrconf_cleanup(void); int addrconf_add_ifaddr(struct net *net, void __user *arg); int addrconf_del_ifaddr(struct net *net, void __user *arg); int addrconf_set_dstaddr(struct net *net, void __user *arg); int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict, u32 banned_flags); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); #endif bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict); int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); int inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr, u32 flags); int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, const struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft); static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr) { memcpy(eui, addr, 3); eui[3] = 0xFF; eui[4] = 0xFE; memcpy(eui + 5, addr + 3, 3); } static inline void addrconf_addr_eui48(u8 *eui, const char *const addr) { addrconf_addr_eui48_base(eui, addr); eui[0] ^= 2; } static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) return -1; /* * The zSeries OSA network cards can be shared among various * OS instances, but the OSA cards have only one MAC address. * This leads to duplicate address conflicts in conjunction * with IPv6 if more than one instance uses the same card. * * The driver for these cards can deliver a unique 16-bit * identifier for each instance sharing the same card. It is * placed instead of 0xFFFE in the interface identifier. The * "u" bit of the interface identifier is not inverted in this * case. Hence the resulting interface identifier has local * scope according to RFC2373. */ addrconf_addr_eui48_base(eui, dev->dev_addr); if (dev->dev_id) { eui[3] = (dev->dev_id >> 8) & 0xFF; eui[4] = dev->dev_id & 0xFF; } else { eui[0] ^= 2; } return 0; } static inline unsigned long addrconf_timeout_fixup(u32 timeout, unsigned int unit) { if (timeout == 0xffffffff) return ~0UL; /* * Avoid arithmetic overflow. * Assuming unit is constant and non-zero, this "if" statement * will go away on 64bit archs. */ if (0xfffffffe > LONG_MAX / unit && timeout > LONG_MAX / unit) return LONG_MAX / unit; return timeout; } static inline int addrconf_finite_timeout(unsigned long timeout) { return ~timeout; } /* * IPv6 Address Label subsystem (addrlabel.c) */ int ipv6_addr_label_init(void); void ipv6_addr_label_cleanup(void); void ipv6_addr_label_rtnl_register(void); u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex); /* * multicast prototypes (mcast.c) */ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_mc_close(struct sock *sk); void ipv6_sock_mc_close(struct sock *sk); bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr); int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr); void ipv6_mc_up(struct inet6_dev *idev); void ipv6_mc_down(struct inet6_dev *idev); void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed); void addrconf_dad_failure(struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); void ipv6_mc_dad_complete(struct inet6_dev *idev); /* A stub used by vxlan module. This is ugly, ideally these * symbols should be built into the core kernel. */ struct ipv6_stub { int (*ipv6_sock_mc_join)(struct sock *sk, int ifindex, const struct in6_addr *addr); int (*ipv6_sock_mc_drop)(struct sock *sk, int ifindex, const struct in6_addr *addr); struct dst_entry *(*ipv6_dst_lookup_flow)(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); void (*udpv6_encap_enable)(void); void (*ndisc_send_na)(struct net_device *dev, const struct in6_addr *daddr, const struct in6_addr *solicited_addr, bool router, bool solicited, bool override, bool inc_opt); struct neigh_table *nd_tbl; }; extern const struct ipv6_stub *ipv6_stub __read_mostly; /* * identify MLD packets for MLD filter exceptions */ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset) { struct icmp6hdr *hdr; if (nexthdr != IPPROTO_ICMPV6 || !pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr))) return false; hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (hdr->icmp6_type) { case ICMPV6_MGM_QUERY: case ICMPV6_MGM_REPORT: case ICMPV6_MGM_REDUCTION: case ICMPV6_MLD2_REPORT: return true; default: break; } return false; } void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao); /* * anycast prototypes (anycast.c) */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_ac_close(struct sock *sk); void ipv6_sock_ac_close(struct sock *sk); int __ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr); int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); void ipv6_ac_destroy_dev(struct inet6_dev *idev); bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr); /* Device notifier */ int register_inet6addr_notifier(struct notifier_block *nb); int unregister_inet6addr_notifier(struct notifier_block *nb); int inet6addr_notifier_call_chain(unsigned long val, void *v); int register_inet6addr_validator_notifier(struct notifier_block *nb); int unregister_inet6addr_validator_notifier(struct notifier_block *nb); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v); void inet6_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv6_devconf *devconf); /** * __in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) { return rcu_dereference_rtnl(dev->ip6_ptr); } /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * This version can be used in any context, and takes a reference * on the inet6_dev. Callers must use in6_dev_put() later to * release this reference. */ static inline struct inet6_dev *in6_dev_get(const struct net_device *dev) { struct inet6_dev *idev; rcu_read_lock(); idev = rcu_dereference(dev->ip6_ptr); if (idev) refcount_inc(&idev->refcnt); rcu_read_unlock(); return idev; } static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev) { struct inet6_dev *idev = __in6_dev_get(dev); return idev ? idev->nd_parms : NULL; } void in6_dev_finish_destroy(struct inet6_dev *idev); static inline void in6_dev_put(struct inet6_dev *idev) { if (refcount_dec_and_test(&idev->refcnt)) in6_dev_finish_destroy(idev); } static inline void in6_dev_put_clear(struct inet6_dev **pidev) { struct inet6_dev *idev = *pidev; if (idev) { in6_dev_put(idev); *pidev = NULL; } } static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); } static inline void in6_dev_hold(struct inet6_dev *idev) { refcount_inc(&idev->refcnt); } void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) { if (refcount_dec_and_test(&ifp->refcnt)) inet6_ifa_finish_destroy(ifp); } static inline void __in6_ifa_put(struct inet6_ifaddr *ifp) { refcount_dec(&ifp->refcnt); } static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) { refcount_inc(&ifp->refcnt); } /* * compute link-local solicited-node multicast address */ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, struct in6_addr *solicited) { ipv6_addr_set(solicited, htonl(0xFF020000), 0, htonl(0x1), htonl(0xFF000000) | addr->s6_addr32[3]); } static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(1))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; #endif } static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(2))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; #endif } static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) { return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); } static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | ((p[1] ^ cpu_to_be64(0x00000001ff000000UL)) & cpu_to_be64(0xffffffffff000000UL))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x00000001)) | (addr->s6_addr[12] ^ 0xff)) == 0; #endif } #ifdef CONFIG_PROC_FS int if6_proc_init(void); void if6_proc_exit(void); #endif #endif
46 45 46 6 6 48 48 84 83 82 84 3 83 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 /* * OSS compatible sequencer driver * * registration of device and proc * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/compat.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/initval.h> #include "seq_oss_device.h" #include "seq_oss_synth.h" /* * module option */ MODULE_AUTHOR("Takashi Iwai <tiwai@suse.de>"); MODULE_DESCRIPTION("OSS-compatible sequencer module"); MODULE_LICENSE("GPL"); /* Takashi says this is really only for sound-service-0-, but this is OK. */ MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_SEQUENCER); MODULE_ALIAS_SNDRV_MINOR(SNDRV_MINOR_OSS_MUSIC); /* * prototypes */ static int register_device(void); static void unregister_device(void); #ifdef CONFIG_SND_PROC_FS static int register_proc(void); static void unregister_proc(void); #else static inline int register_proc(void) { return 0; } static inline void unregister_proc(void) {} #endif static int odev_open(struct inode *inode, struct file *file); static int odev_release(struct inode *inode, struct file *file); static ssize_t odev_read(struct file *file, char __user *buf, size_t count, loff_t *offset); static ssize_t odev_write(struct file *file, const char __user *buf, size_t count, loff_t *offset); static long odev_ioctl(struct file *file, unsigned int cmd, unsigned long arg); static unsigned int odev_poll(struct file *file, poll_table * wait); /* * module interface */ static struct snd_seq_driver seq_oss_synth_driver = { .driver = { .name = KBUILD_MODNAME, .probe = snd_seq_oss_synth_probe, .remove = snd_seq_oss_synth_remove, }, .id = SNDRV_SEQ_DEV_ID_OSS, .argsize = sizeof(struct snd_seq_oss_reg), }; static int __init alsa_seq_oss_init(void) { int rc; if ((rc = register_device()) < 0) goto error; if ((rc = register_proc()) < 0) { unregister_device(); goto error; } if ((rc = snd_seq_oss_create_client()) < 0) { unregister_proc(); unregister_device(); goto error; } rc = snd_seq_driver_register(&seq_oss_synth_driver); if (rc < 0) { snd_seq_oss_delete_client(); unregister_proc(); unregister_device(); goto error; } /* success */ snd_seq_oss_synth_init(); error: return rc; } static void __exit alsa_seq_oss_exit(void) { snd_seq_driver_unregister(&seq_oss_synth_driver); snd_seq_oss_delete_client(); unregister_proc(); unregister_device(); } module_init(alsa_seq_oss_init) module_exit(alsa_seq_oss_exit) /* * ALSA minor device interface */ static DEFINE_MUTEX(register_mutex); static int odev_open(struct inode *inode, struct file *file) { int level, rc; if (iminor(inode) == SNDRV_MINOR_OSS_MUSIC) level = SNDRV_SEQ_OSS_MODE_MUSIC; else level = SNDRV_SEQ_OSS_MODE_SYNTH; mutex_lock(&register_mutex); rc = snd_seq_oss_open(file, level); mutex_unlock(&register_mutex); return rc; } static int odev_release(struct inode *inode, struct file *file) { struct seq_oss_devinfo *dp; if ((dp = file->private_data) == NULL) return 0; mutex_lock(&register_mutex); snd_seq_oss_release(dp); mutex_unlock(&register_mutex); return 0; } static ssize_t odev_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { struct seq_oss_devinfo *dp; dp = file->private_data; if (snd_BUG_ON(!dp)) return -ENXIO; return snd_seq_oss_read(dp, buf, count); } static ssize_t odev_write(struct file *file, const char __user *buf, size_t count, loff_t *offset) { struct seq_oss_devinfo *dp; dp = file->private_data; if (snd_BUG_ON(!dp)) return -ENXIO; return snd_seq_oss_write(dp, buf, count, file); } static long odev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct seq_oss_devinfo *dp; long rc; dp = file->private_data; if (snd_BUG_ON(!dp)) return -ENXIO; if (cmd != SNDCTL_SEQ_SYNC && mutex_lock_interruptible(&register_mutex)) return -ERESTARTSYS; rc = snd_seq_oss_ioctl(dp, cmd, arg); if (cmd != SNDCTL_SEQ_SYNC) mutex_unlock(&register_mutex); return rc; } #ifdef CONFIG_COMPAT static long odev_ioctl_compat(struct file *file, unsigned int cmd, unsigned long arg) { return odev_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } #else #define odev_ioctl_compat NULL #endif static unsigned int odev_poll(struct file *file, poll_table * wait) { struct seq_oss_devinfo *dp; dp = file->private_data; if (snd_BUG_ON(!dp)) return -ENXIO; return snd_seq_oss_poll(dp, file, wait); } /* * registration of sequencer minor device */ static const struct file_operations seq_oss_f_ops = { .owner = THIS_MODULE, .read = odev_read, .write = odev_write, .open = odev_open, .release = odev_release, .poll = odev_poll, .unlocked_ioctl = odev_ioctl, .compat_ioctl = odev_ioctl_compat, .llseek = noop_llseek, }; static int __init register_device(void) { int rc; mutex_lock(&register_mutex); if ((rc = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_SEQUENCER, NULL, 0, &seq_oss_f_ops, NULL)) < 0) { pr_err("ALSA: seq_oss: can't register device seq\n"); mutex_unlock(&register_mutex); return rc; } if ((rc = snd_register_oss_device(SNDRV_OSS_DEVICE_TYPE_MUSIC, NULL, 0, &seq_oss_f_ops, NULL)) < 0) { pr_err("ALSA: seq_oss: can't register device music\n"); snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_SEQUENCER, NULL, 0); mutex_unlock(&register_mutex); return rc; } mutex_unlock(&register_mutex); return 0; } static void unregister_device(void) { mutex_lock(&register_mutex); if (snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_MUSIC, NULL, 0) < 0) pr_err("ALSA: seq_oss: error unregister device music\n"); if (snd_unregister_oss_device(SNDRV_OSS_DEVICE_TYPE_SEQUENCER, NULL, 0) < 0) pr_err("ALSA: seq_oss: error unregister device seq\n"); mutex_unlock(&register_mutex); } /* * /proc interface */ #ifdef CONFIG_SND_PROC_FS static struct snd_info_entry *info_entry; static void info_read(struct snd_info_entry *entry, struct snd_info_buffer *buf) { mutex_lock(&register_mutex); snd_iprintf(buf, "OSS sequencer emulation version %s\n", SNDRV_SEQ_OSS_VERSION_STR); snd_seq_oss_system_info_read(buf); snd_seq_oss_synth_info_read(buf); snd_seq_oss_midi_info_read(buf); mutex_unlock(&register_mutex); } static int __init register_proc(void) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(THIS_MODULE, SNDRV_SEQ_OSS_PROCNAME, snd_seq_root); if (entry == NULL) return -ENOMEM; entry->content = SNDRV_INFO_CONTENT_TEXT; entry->private_data = NULL; entry->c.text.read = info_read; if (snd_info_register(entry) < 0) { snd_info_free_entry(entry); return -ENOMEM; } info_entry = entry; return 0; } static void unregister_proc(void) { snd_info_free_entry(info_entry); info_entry = NULL; } #endif /* CONFIG_SND_PROC_FS */
31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_STRING_HELPERS_H_ #define _LINUX_STRING_HELPERS_H_ #include <linux/types.h> struct file; /* Descriptions of the types of units to * print in */ enum string_size_units { STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ STRING_UNITS_2, /* use binary powers of 2^10 */ }; void string_get_size(u64 size, u64 blk_size, enum string_size_units units, char *buf, int len); #define UNESCAPE_SPACE 0x01 #define UNESCAPE_OCTAL 0x02 #define UNESCAPE_HEX 0x04 #define UNESCAPE_SPECIAL 0x08 #define UNESCAPE_ANY \ (UNESCAPE_SPACE | UNESCAPE_OCTAL | UNESCAPE_HEX | UNESCAPE_SPECIAL) int string_unescape(char *src, char *dst, size_t size, unsigned int flags); static inline int string_unescape_inplace(char *buf, unsigned int flags) { return string_unescape(buf, buf, 0, flags); } static inline int string_unescape_any(char *src, char *dst, size_t size) { return string_unescape(src, dst, size, UNESCAPE_ANY); } static inline int string_unescape_any_inplace(char *buf) { return string_unescape_any(buf, buf, 0); } #define ESCAPE_SPACE 0x01 #define ESCAPE_SPECIAL 0x02 #define ESCAPE_NULL 0x04 #define ESCAPE_OCTAL 0x08 #define ESCAPE_ANY \ (ESCAPE_SPACE | ESCAPE_OCTAL | ESCAPE_SPECIAL | ESCAPE_NULL) #define ESCAPE_NP 0x10 #define ESCAPE_ANY_NP (ESCAPE_ANY | ESCAPE_NP) #define ESCAPE_HEX 0x20 int string_escape_mem(const char *src, size_t isz, char *dst, size_t osz, unsigned int flags, const char *only); static inline int string_escape_mem_any_np(const char *src, size_t isz, char *dst, size_t osz, const char *only) { return string_escape_mem(src, isz, dst, osz, ESCAPE_ANY_NP, only); } static inline int string_escape_str(const char *src, char *dst, size_t sz, unsigned int flags, const char *only) { return string_escape_mem(src, strlen(src), dst, sz, flags, only); } static inline int string_escape_str_any_np(const char *src, char *dst, size_t sz, const char *only) { return string_escape_str(src, dst, sz, ESCAPE_ANY_NP, only); } char *kstrdup_quotable(const char *src, gfp_t gfp); char *kstrdup_quotable_cmdline(struct task_struct *task, gfp_t gfp); char *kstrdup_quotable_file(struct file *file, gfp_t gfp); #endif
360 361 360 353 352 581 577 581 580 17 208 87 87 86 87 87 11 11 11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 /* * Support KVM gust page tracking * * This feature allows us to track page access in guest. Currently, only * write access is tracked. * * Copyright(C) 2015 Intel Corporation. * * Author: * Xiao Guangrong <guangrong.xiao@linux.intel.com> * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. */ #include <linux/kvm_host.h> #include <linux/rculist.h> #include <asm/kvm_host.h> #include <asm/kvm_page_track.h> #include "mmu.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont) { int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) if (!dont || free->arch.gfn_track[i] != dont->arch.gfn_track[i]) { kvfree(free->arch.gfn_track[i]); free->arch.gfn_track[i] = NULL; } } int kvm_page_track_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) { int i; for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) { slot->arch.gfn_track[i] = kvzalloc(npages * sizeof(*slot->arch.gfn_track[i]), GFP_KERNEL); if (!slot->arch.gfn_track[i]) goto track_free; } return 0; track_free: kvm_page_track_free_memslot(slot, NULL); return -ENOMEM; } static inline bool page_track_mode_is_valid(enum kvm_page_track_mode mode) { if (mode < 0 || mode >= KVM_PAGE_TRACK_MAX) return false; return true; } static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode, short count) { int index, val; index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); val = slot->arch.gfn_track[mode][index]; if (WARN_ON(val + count < 0 || val + count > USHRT_MAX)) return; slot->arch.gfn_track[mode][index] += count; } /* * add guest page to the tracking pool so that corresponding access on that * page will be intercepted. * * It should be called under the protection both of mmu-lock and kvm->srcu * or kvm->slots_lock. * * @kvm: the guest instance we are interested in. * @slot: the @gfn belongs to. * @gfn: the guest page. * @mode: tracking mode, currently only write track is supported. */ void kvm_slot_page_track_add_page(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode) { if (WARN_ON(!page_track_mode_is_valid(mode))) return; update_gfn_track(slot, gfn, mode, 1); /* * new track stops large page mapping for the * tracked page. */ kvm_mmu_gfn_disallow_lpage(slot, gfn); if (mode == KVM_PAGE_TRACK_WRITE) if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) kvm_flush_remote_tlbs(kvm); } EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); /* * remove the guest page from the tracking pool which stops the interception * of corresponding access on that page. It is the opposed operation of * kvm_slot_page_track_add_page(). * * It should be called under the protection both of mmu-lock and kvm->srcu * or kvm->slots_lock. * * @kvm: the guest instance we are interested in. * @slot: the @gfn belongs to. * @gfn: the guest page. * @mode: tracking mode, currently only write track is supported. */ void kvm_slot_page_track_remove_page(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, enum kvm_page_track_mode mode) { if (WARN_ON(!page_track_mode_is_valid(mode))) return; update_gfn_track(slot, gfn, mode, -1); /* * allow large page mapping for the tracked page * after the tracker is gone. */ kvm_mmu_gfn_allow_lpage(slot, gfn); } EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. */ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, enum kvm_page_track_mode mode) { struct kvm_memory_slot *slot; int index; if (WARN_ON(!page_track_mode_is_valid(mode))) return false; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); if (!slot) return false; index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); return !!ACCESS_ONCE(slot->arch.gfn_track[mode][index]); } void kvm_page_track_cleanup(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; cleanup_srcu_struct(&head->track_srcu); } void kvm_page_track_init(struct kvm *kvm) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; init_srcu_struct(&head->track_srcu); INIT_HLIST_HEAD(&head->track_notifier_list); } /* * register the notifier so that event interception for the tracked guest * pages can be received. */ void kvm_page_track_register_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; spin_lock(&kvm->mmu_lock); hlist_add_head_rcu(&n->node, &head->track_notifier_list); spin_unlock(&kvm->mmu_lock); } EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); /* * stop receiving the event interception. It is the opposed operation of * kvm_page_track_register_notifier(). */ void kvm_page_track_unregister_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n) { struct kvm_page_track_notifier_head *head; head = &kvm->arch.track_notifier_head; spin_lock(&kvm->mmu_lock); hlist_del_rcu(&n->node); spin_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); /* * Notify the node that write access is intercepted and write emulation is * finished at this time. * * The node should figure out if the written page is the one that node is * interested in by itself. */ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; int idx; head = &vcpu->kvm->arch.track_notifier_head; if (hlist_empty(&head->track_notifier_list)) return; idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_write) n->track_write(vcpu, gpa, new, bytes, n); srcu_read_unlock(&head->track_srcu, idx); } /* * Notify the node that memory slot is being removed or moved so that it can * drop write-protection for the pages in the memory slot. * * The node should figure out it has any write-protected pages in this slot * by itself. */ void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) { struct kvm_page_track_notifier_head *head; struct kvm_page_track_notifier_node *n; int idx; head = &kvm->arch.track_notifier_head; if (hlist_empty(&head->track_notifier_list)) return; idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_flush_slot) n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); }
2 2 2 1 1 1 2 2 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 // SPDX-License-Identifier: GPL-2.0 /* * fs/bfs/file.c * BFS file operations. * Copyright (C) 1999,2000 Tigran Aivazian <tigran@veritas.com> * * Make the file block allocation algorithm understand the size * of the underlying block device. * Copyright (C) 2007 Dmitri Vorobiev <dmitri.vorobiev@gmail.com> * */ #include <linux/fs.h> #include <linux/buffer_head.h> #include "bfs.h" #undef DEBUG #ifdef DEBUG #define dprintf(x...) printf(x) #else #define dprintf(x...) #endif const struct file_operations bfs_file_operations = { .llseek = generic_file_llseek, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, }; static int bfs_move_block(unsigned long from, unsigned long to, struct super_block *sb) { struct buffer_head *bh, *new; bh = sb_bread(sb, from); if (!bh) return -EIO; new = sb_getblk(sb, to); memcpy(new->b_data, bh->b_data, bh->b_size); mark_buffer_dirty(new); bforget(bh); brelse(new); return 0; } static int bfs_move_blocks(struct super_block *sb, unsigned long start, unsigned long end, unsigned long where) { unsigned long i; dprintf("%08lx-%08lx->%08lx\n", start, end, where); for (i = start; i <= end; i++) if(bfs_move_block(i, where + i, sb)) { dprintf("failed to move block %08lx -> %08lx\n", i, where + i); return -EIO; } return 0; } static int bfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_result, int create) { unsigned long phys; int err; struct super_block *sb = inode->i_sb; struct bfs_sb_info *info = BFS_SB(sb); struct bfs_inode_info *bi = BFS_I(inode); phys = bi->i_sblock + block; if (!create) { if (phys <= bi->i_eblock) { dprintf("c=%d, b=%08lx, phys=%09lx (granted)\n", create, (unsigned long)block, phys); map_bh(bh_result, sb, phys); } return 0; } /* * If the file is not empty and the requested block is within the * range of blocks allocated for this file, we can grant it. */ if (bi->i_sblock && (phys <= bi->i_eblock)) { dprintf("c=%d, b=%08lx, phys=%08lx (interim block granted)\n", create, (unsigned long)block, phys); map_bh(bh_result, sb, phys); return 0; } /* The file will be extended, so let's see if there is enough space. */ if (phys >= info->si_blocks) return -ENOSPC; /* The rest has to be protected against itself. */ mutex_lock(&info->bfs_lock); /* * If the last data block for this file is the last allocated * block, we can extend the file trivially, without moving it * anywhere. */ if (bi->i_eblock == info->si_lf_eblk) { dprintf("c=%d, b=%08lx, phys=%08lx (simple extension)\n", create, (unsigned long)block, phys); map_bh(bh_result, sb, phys); info->si_freeb -= phys - bi->i_eblock; info->si_lf_eblk = bi->i_eblock = phys; mark_inode_dirty(inode); err = 0; goto out; } /* Ok, we have to move this entire file to the next free block. */ phys = info->si_lf_eblk + 1; if (phys + block >= info->si_blocks) { err = -ENOSPC; goto out; } if (bi->i_sblock) { err = bfs_move_blocks(inode->i_sb, bi->i_sblock, bi->i_eblock, phys); if (err) { dprintf("failed to move ino=%08lx -> fs corruption\n", inode->i_ino); goto out; } } else err = 0; dprintf("c=%d, b=%08lx, phys=%08lx (moved)\n", create, (unsigned long)block, phys); bi->i_sblock = phys; phys += block; info->si_lf_eblk = bi->i_eblock = phys; /* * This assumes nothing can write the inode back while we are here * and thus update inode->i_blocks! (XXX) */ info->si_freeb -= bi->i_eblock - bi->i_sblock + 1 - inode->i_blocks; mark_inode_dirty(inode); map_bh(bh_result, sb, phys); out: mutex_unlock(&info->bfs_lock); return err; } static int bfs_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, bfs_get_block, wbc); } static int bfs_readpage(struct file *file, struct page *page) { return block_read_full_page(page, bfs_get_block); } static void bfs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; if (to > inode->i_size) truncate_pagecache(inode, inode->i_size); } static int bfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { int ret; ret = block_write_begin(mapping, pos, len, flags, pagep, bfs_get_block); if (unlikely(ret)) bfs_write_failed(mapping, pos + len); return ret; } static sector_t bfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, bfs_get_block); } const struct address_space_operations bfs_aops = { .readpage = bfs_readpage, .writepage = bfs_writepage, .write_begin = bfs_write_begin, .write_end = generic_write_end, .bmap = bfs_bmap, }; const struct inode_operations bfs_file_inops;
20 11465 4560 13 159 159 21 5114 372 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_LIST_BL_H #define _LINUX_LIST_BL_H #include <linux/list.h> #include <linux/bit_spinlock.h> /* * Special version of lists, where head of the list has a lock in the lowest * bit. This is useful for scalable hash tables without increasing memory * footprint overhead. * * For modification operations, the 0 bit of hlist_bl_head->first * pointer must be set. * * With some small modifications, this can easily be adapted to store several * arbitrary bits (not just a single lock bit), if the need arises to store * some fast and compact auxiliary data. */ #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) #define LIST_BL_LOCKMASK 1UL #else #define LIST_BL_LOCKMASK 0UL #endif #ifdef CONFIG_DEBUG_LIST #define LIST_BL_BUG_ON(x) BUG_ON(x) #else #define LIST_BL_BUG_ON(x) #endif struct hlist_bl_head { struct hlist_bl_node *first; }; struct hlist_bl_node { struct hlist_bl_node *next, **pprev; }; #define INIT_HLIST_BL_HEAD(ptr) \ ((ptr)->first = NULL) static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) { h->next = NULL; h->pprev = NULL; } #define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member) static inline bool hlist_bl_unhashed(const struct hlist_bl_node *h) { return !h->pprev; } static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) { return (struct hlist_bl_node *) ((unsigned long)h->first & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_set_first(struct hlist_bl_head *h, struct hlist_bl_node *n) { LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); LIST_BL_BUG_ON(((unsigned long)h->first & LIST_BL_LOCKMASK) != LIST_BL_LOCKMASK); h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); } static inline bool hlist_bl_empty(const struct hlist_bl_head *h) { return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK); } static inline void hlist_bl_add_head(struct hlist_bl_node *n, struct hlist_bl_head *h) { struct hlist_bl_node *first = hlist_bl_first(h); n->next = first; if (first) first->pprev = &n->next; n->pprev = &h->first; hlist_bl_set_first(h, n); } static inline void __hlist_bl_del(struct hlist_bl_node *n) { struct hlist_bl_node *next = n->next; struct hlist_bl_node **pprev = n->pprev; LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); /* pprev may be `first`, so be careful not to lose the lock bit */ WRITE_ONCE(*pprev, (struct hlist_bl_node *) ((unsigned long)next | ((unsigned long)*pprev & LIST_BL_LOCKMASK))); if (next) next->pprev = pprev; } static inline void hlist_bl_del(struct hlist_bl_node *n) { __hlist_bl_del(n); n->next = LIST_POISON1; n->pprev = LIST_POISON2; } static inline void hlist_bl_del_init(struct hlist_bl_node *n) { if (!hlist_bl_unhashed(n)) { __hlist_bl_del(n); INIT_HLIST_BL_NODE(n); } } static inline void hlist_bl_lock(struct hlist_bl_head *b) { bit_spin_lock(0, (unsigned long *)b); } static inline void hlist_bl_unlock(struct hlist_bl_head *b) { __bit_spin_unlock(0, (unsigned long *)b); } static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) { return bit_spin_is_locked(0, (unsigned long *)b); } /** * hlist_bl_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * */ #define hlist_bl_for_each_entry(tpos, pos, head, member) \ for (pos = hlist_bl_first(head); \ pos && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = pos->next) /** * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @n: another &struct hlist_node to use as temporary storage * @head: the head for your list. * @member: the name of the hlist_node within the struct. */ #define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member) \ for (pos = hlist_bl_first(head); \ pos && ({ n = pos->next; 1; }) && \ ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ pos = n) #endif
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 /* * Cryptographic API. * * Compression operations. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) * any later version. * */ #include <linux/types.h> #include <linux/crypto.h> #include <linux/errno.h> #include <linux/string.h> #include "internal.h" static int crypto_compress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { return tfm->__crt_alg->cra_compress.coa_compress(tfm, src, slen, dst, dlen); } static int crypto_decompress(struct crypto_tfm *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int *dlen) { return tfm->__crt_alg->cra_compress.coa_decompress(tfm, src, slen, dst, dlen); } int crypto_init_compress_ops(struct crypto_tfm *tfm) { struct compress_tfm *ops = &tfm->crt_compress; ops->cot_compress = crypto_compress; ops->cot_decompress = crypto_decompress; return 0; }
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 /* * ppp_deflate.c - interface the zlib procedures for Deflate compression * and decompression (as used by gzip) to the PPP code. * * Copyright 1994-1998 Paul Mackerras. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * version 2 as published by the Free Software Foundation. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/string.h> #include <linux/ppp_defs.h> #include <linux/ppp-comp.h> #include <linux/zlib.h> #include <asm/unaligned.h> /* * State for a Deflate (de)compressor. */ struct ppp_deflate_state { int seqno; int w_size; int unit; int mru; int debug; z_stream strm; struct compstat stats; }; #define DEFLATE_OVHD 2 /* Deflate overhead/packet */ static void *z_comp_alloc(unsigned char *options, int opt_len); static void *z_decomp_alloc(unsigned char *options, int opt_len); static void z_comp_free(void *state); static void z_decomp_free(void *state); static int z_comp_init(void *state, unsigned char *options, int opt_len, int unit, int hdrlen, int debug); static int z_decomp_init(void *state, unsigned char *options, int opt_len, int unit, int hdrlen, int mru, int debug); static int z_compress(void *state, unsigned char *rptr, unsigned char *obuf, int isize, int osize); static void z_incomp(void *state, unsigned char *ibuf, int icnt); static int z_decompress(void *state, unsigned char *ibuf, int isize, unsigned char *obuf, int osize); static void z_comp_reset(void *state); static void z_decomp_reset(void *state); static void z_comp_stats(void *state, struct compstat *stats); /** * z_comp_free - free the memory used by a compressor * @arg: pointer to the private state for the compressor. */ static void z_comp_free(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (state) { zlib_deflateEnd(&state->strm); vfree(state->strm.workspace); kfree(state); } } /** * z_comp_alloc - allocate space for a compressor. * @options: pointer to CCP option data * @opt_len: length of the CCP option at @options. * * The @options pointer points to the a buffer containing the * CCP option data for the compression being negotiated. It is * formatted according to RFC1979, and describes the window * size that the peer is requesting that we use in compressing * data to be sent to it. * * Returns the pointer to the private state for the compressor, * or NULL if we could not allocate enough memory. */ static void *z_comp_alloc(unsigned char *options, int opt_len) { struct ppp_deflate_state *state; int w_size; if (opt_len != CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || options[3] != DEFLATE_CHK_SEQUENCE) return NULL; w_size = DEFLATE_SIZE(options[2]); if (w_size < DEFLATE_MIN_SIZE || w_size > DEFLATE_MAX_SIZE) return NULL; state = kzalloc(sizeof(*state), GFP_KERNEL); if (state == NULL) return NULL; state->strm.next_in = NULL; state->w_size = w_size; state->strm.workspace = vmalloc(zlib_deflate_workspacesize(-w_size, 8)); if (state->strm.workspace == NULL) goto out_free; if (zlib_deflateInit2(&state->strm, Z_DEFAULT_COMPRESSION, DEFLATE_METHOD_VAL, -w_size, 8, Z_DEFAULT_STRATEGY) != Z_OK) goto out_free; return (void *) state; out_free: z_comp_free(state); return NULL; } /** * z_comp_init - initialize a previously-allocated compressor. * @arg: pointer to the private state for the compressor * @options: pointer to the CCP option data describing the * compression that was negotiated with the peer * @opt_len: length of the CCP option data at @options * @unit: PPP unit number for diagnostic messages * @hdrlen: ignored (present for backwards compatibility) * @debug: debug flag; if non-zero, debug messages are printed. * * The CCP options described by @options must match the options * specified when the compressor was allocated. The compressor * history is reset. Returns 0 for failure (CCP options don't * match) or 1 for success. */ static int z_comp_init(void *arg, unsigned char *options, int opt_len, int unit, int hdrlen, int debug) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (opt_len < CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || DEFLATE_SIZE(options[2]) != state->w_size || options[3] != DEFLATE_CHK_SEQUENCE) return 0; state->seqno = 0; state->unit = unit; state->debug = debug; zlib_deflateReset(&state->strm); return 1; } /** * z_comp_reset - reset a previously-allocated compressor. * @arg: pointer to private state for the compressor. * * This clears the history for the compressor and makes it * ready to start emitting a new compressed stream. */ static void z_comp_reset(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; state->seqno = 0; zlib_deflateReset(&state->strm); } /** * z_compress - compress a PPP packet with Deflate compression. * @arg: pointer to private state for the compressor * @rptr: uncompressed packet (input) * @obuf: compressed packet (output) * @isize: size of uncompressed packet * @osize: space available at @obuf * * Returns the length of the compressed packet, or 0 if the * packet is incompressible. */ static int z_compress(void *arg, unsigned char *rptr, unsigned char *obuf, int isize, int osize) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; int r, proto, off, olen, oavail; unsigned char *wptr; /* * Check that the protocol is in the range we handle. */ proto = PPP_PROTOCOL(rptr); if (proto > 0x3fff || proto == 0xfd || proto == 0xfb) return 0; /* Don't generate compressed packets which are larger than the uncompressed packet. */ if (osize > isize) osize = isize; wptr = obuf; /* * Copy over the PPP header and store the 2-byte sequence number. */ wptr[0] = PPP_ADDRESS(rptr); wptr[1] = PPP_CONTROL(rptr); put_unaligned_be16(PPP_COMP, wptr + 2); wptr += PPP_HDRLEN; put_unaligned_be16(state->seqno, wptr); wptr += DEFLATE_OVHD; olen = PPP_HDRLEN + DEFLATE_OVHD; state->strm.next_out = wptr; state->strm.avail_out = oavail = osize - olen; ++state->seqno; off = (proto > 0xff) ? 2 : 3; /* skip 1st proto byte if 0 */ rptr += off; state->strm.next_in = rptr; state->strm.avail_in = (isize - off); for (;;) { r = zlib_deflate(&state->strm, Z_PACKET_FLUSH); if (r != Z_OK) { if (state->debug) printk(KERN_ERR "z_compress: deflate returned %d\n", r); break; } if (state->strm.avail_out == 0) { olen += oavail; state->strm.next_out = NULL; state->strm.avail_out = oavail = 1000000; } else { break; /* all done */ } } olen += oavail - state->strm.avail_out; /* * See if we managed to reduce the size of the packet. */ if (olen < isize && olen <= osize) { state->stats.comp_bytes += olen; state->stats.comp_packets++; } else { state->stats.inc_bytes += isize; state->stats.inc_packets++; olen = 0; } state->stats.unc_bytes += isize; state->stats.unc_packets++; return olen; } /** * z_comp_stats - return compression statistics for a compressor * or decompressor. * @arg: pointer to private space for the (de)compressor * @stats: pointer to a struct compstat to receive the result. */ static void z_comp_stats(void *arg, struct compstat *stats) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; *stats = state->stats; } /** * z_decomp_free - Free the memory used by a decompressor. * @arg: pointer to private space for the decompressor. */ static void z_decomp_free(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (state) { zlib_inflateEnd(&state->strm); vfree(state->strm.workspace); kfree(state); } } /** * z_decomp_alloc - allocate space for a decompressor. * @options: pointer to CCP option data * @opt_len: length of the CCP option at @options. * * The @options pointer points to the a buffer containing the * CCP option data for the compression being negotiated. It is * formatted according to RFC1979, and describes the window * size that we are requesting the peer to use in compressing * data to be sent to us. * * Returns the pointer to the private state for the decompressor, * or NULL if we could not allocate enough memory. */ static void *z_decomp_alloc(unsigned char *options, int opt_len) { struct ppp_deflate_state *state; int w_size; if (opt_len != CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || options[3] != DEFLATE_CHK_SEQUENCE) return NULL; w_size = DEFLATE_SIZE(options[2]); if (w_size < DEFLATE_MIN_SIZE || w_size > DEFLATE_MAX_SIZE) return NULL; state = kzalloc(sizeof(*state), GFP_KERNEL); if (state == NULL) return NULL; state->w_size = w_size; state->strm.next_out = NULL; state->strm.workspace = vmalloc(zlib_inflate_workspacesize()); if (state->strm.workspace == NULL) goto out_free; if (zlib_inflateInit2(&state->strm, -w_size) != Z_OK) goto out_free; return (void *) state; out_free: z_decomp_free(state); return NULL; } /** * z_decomp_init - initialize a previously-allocated decompressor. * @arg: pointer to the private state for the decompressor * @options: pointer to the CCP option data describing the * compression that was negotiated with the peer * @opt_len: length of the CCP option data at @options * @unit: PPP unit number for diagnostic messages * @hdrlen: ignored (present for backwards compatibility) * @mru: maximum length of decompressed packets * @debug: debug flag; if non-zero, debug messages are printed. * * The CCP options described by @options must match the options * specified when the decompressor was allocated. The decompressor * history is reset. Returns 0 for failure (CCP options don't * match) or 1 for success. */ static int z_decomp_init(void *arg, unsigned char *options, int opt_len, int unit, int hdrlen, int mru, int debug) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; if (opt_len < CILEN_DEFLATE || (options[0] != CI_DEFLATE && options[0] != CI_DEFLATE_DRAFT) || options[1] != CILEN_DEFLATE || DEFLATE_METHOD(options[2]) != DEFLATE_METHOD_VAL || DEFLATE_SIZE(options[2]) != state->w_size || options[3] != DEFLATE_CHK_SEQUENCE) return 0; state->seqno = 0; state->unit = unit; state->debug = debug; state->mru = mru; zlib_inflateReset(&state->strm); return 1; } /** * z_decomp_reset - reset a previously-allocated decompressor. * @arg: pointer to private state for the decompressor. * * This clears the history for the decompressor and makes it * ready to receive a new compressed stream. */ static void z_decomp_reset(void *arg) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; state->seqno = 0; zlib_inflateReset(&state->strm); } /** * z_decompress - decompress a Deflate-compressed packet. * @arg: pointer to private state for the decompressor * @ibuf: pointer to input (compressed) packet data * @isize: length of input packet * @obuf: pointer to space for output (decompressed) packet * @osize: amount of space available at @obuf * * Because of patent problems, we return DECOMP_ERROR for errors * found by inspecting the input data and for system problems, but * DECOMP_FATALERROR for any errors which could possibly be said to * be being detected "after" decompression. For DECOMP_ERROR, * we can issue a CCP reset-request; for DECOMP_FATALERROR, we may be * infringing a patent of Motorola's if we do, so we take CCP down * instead. * * Given that the frame has the correct sequence number and a good FCS, * errors such as invalid codes in the input most likely indicate a * bug, so we return DECOMP_FATALERROR for them in order to turn off * compression, even though they are detected by inspecting the input. */ static int z_decompress(void *arg, unsigned char *ibuf, int isize, unsigned char *obuf, int osize) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; int olen, seq, r; int decode_proto, overflow; unsigned char overflow_buf[1]; if (isize <= PPP_HDRLEN + DEFLATE_OVHD) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: short pkt (%d)\n", state->unit, isize); return DECOMP_ERROR; } /* Check the sequence number. */ seq = get_unaligned_be16(ibuf + PPP_HDRLEN); if (seq != (state->seqno & 0xffff)) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: bad seq # %d, expected %d\n", state->unit, seq, state->seqno & 0xffff); return DECOMP_ERROR; } ++state->seqno; /* * Fill in the first part of the PPP header. The protocol field * comes from the decompressed data. */ obuf[0] = PPP_ADDRESS(ibuf); obuf[1] = PPP_CONTROL(ibuf); obuf[2] = 0; /* * Set up to call inflate. We set avail_out to 1 initially so we can * look at the first byte of the output and decide whether we have * a 1-byte or 2-byte protocol field. */ state->strm.next_in = ibuf + PPP_HDRLEN + DEFLATE_OVHD; state->strm.avail_in = isize - (PPP_HDRLEN + DEFLATE_OVHD); state->strm.next_out = obuf + 3; state->strm.avail_out = 1; decode_proto = 1; overflow = 0; /* * Call inflate, supplying more input or output as needed. */ for (;;) { r = zlib_inflate(&state->strm, Z_PACKET_FLUSH); if (r != Z_OK) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: inflate returned %d (%s)\n", state->unit, r, (state->strm.msg? state->strm.msg: "")); return DECOMP_FATALERROR; } if (state->strm.avail_out != 0) break; /* all done */ if (decode_proto) { state->strm.avail_out = osize - PPP_HDRLEN; if ((obuf[3] & 1) == 0) { /* 2-byte protocol field */ obuf[2] = obuf[3]; --state->strm.next_out; ++state->strm.avail_out; } decode_proto = 0; } else if (!overflow) { /* * We've filled up the output buffer; the only way to * find out whether inflate has any more characters * left is to give it another byte of output space. */ state->strm.next_out = overflow_buf; state->strm.avail_out = 1; overflow = 1; } else { if (state->debug) printk(KERN_DEBUG "z_decompress%d: ran out of mru\n", state->unit); return DECOMP_FATALERROR; } } if (decode_proto) { if (state->debug) printk(KERN_DEBUG "z_decompress%d: didn't get proto\n", state->unit); return DECOMP_ERROR; } olen = osize + overflow - state->strm.avail_out; state->stats.unc_bytes += olen; state->stats.unc_packets++; state->stats.comp_bytes += isize; state->stats.comp_packets++; return olen; } /** * z_incomp - add incompressible input data to the history. * @arg: pointer to private state for the decompressor * @ibuf: pointer to input packet data * @icnt: length of input data. */ static void z_incomp(void *arg, unsigned char *ibuf, int icnt) { struct ppp_deflate_state *state = (struct ppp_deflate_state *) arg; int proto, r; /* * Check that the protocol is one we handle. */ proto = PPP_PROTOCOL(ibuf); if (proto > 0x3fff || proto == 0xfd || proto == 0xfb) return; ++state->seqno; /* * We start at the either the 1st or 2nd byte of the protocol field, * depending on whether the protocol value is compressible. */ state->strm.next_in = ibuf + 3; state->strm.avail_in = icnt - 3; if (proto > 0xff) { --state->strm.next_in; ++state->strm.avail_in; } r = zlib_inflateIncomp(&state->strm); if (r != Z_OK) { /* gak! */ if (state->debug) { printk(KERN_DEBUG "z_incomp%d: inflateIncomp returned %d (%s)\n", state->unit, r, (state->strm.msg? state->strm.msg: "")); } return; } /* * Update stats. */ state->stats.inc_bytes += icnt; state->stats.inc_packets++; state->stats.unc_bytes += icnt; state->stats.unc_packets++; } /************************************************************* * Module interface table *************************************************************/ /* These are in ppp_generic.c */ extern int ppp_register_compressor (struct compressor *cp); extern void ppp_unregister_compressor (struct compressor *cp); /* * Procedures exported to if_ppp.c. */ static struct compressor ppp_deflate = { .compress_proto = CI_DEFLATE, .comp_alloc = z_comp_alloc, .comp_free = z_comp_free, .comp_init = z_comp_init, .comp_reset = z_comp_reset, .compress = z_compress, .comp_stat = z_comp_stats, .decomp_alloc = z_decomp_alloc, .decomp_free = z_decomp_free, .decomp_init = z_decomp_init, .decomp_reset = z_decomp_reset, .decompress = z_decompress, .incomp = z_incomp, .decomp_stat = z_comp_stats, .owner = THIS_MODULE }; static struct compressor ppp_deflate_draft = { .compress_proto = CI_DEFLATE_DRAFT, .comp_alloc = z_comp_alloc, .comp_free = z_comp_free, .comp_init = z_comp_init, .comp_reset = z_comp_reset, .compress = z_compress, .comp_stat = z_comp_stats, .decomp_alloc = z_decomp_alloc, .decomp_free = z_decomp_free, .decomp_init = z_decomp_init, .decomp_reset = z_decomp_reset, .decompress = z_decompress, .incomp = z_incomp, .decomp_stat = z_comp_stats, .owner = THIS_MODULE }; static int __init deflate_init(void) { int rc; rc = ppp_register_compressor(&ppp_deflate); if (rc) return rc; rc = ppp_register_compressor(&ppp_deflate_draft); if (rc) { ppp_unregister_compressor(&ppp_deflate); return rc; } pr_info("PPP Deflate Compression module registered\n"); return 0; } static void __exit deflate_cleanup(void) { ppp_unregister_compressor(&ppp_deflate); ppp_unregister_compressor(&ppp_deflate_draft); } module_init(deflate_init); module_exit(deflate_cleanup); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS("ppp-compress-" __stringify(CI_DEFLATE)); MODULE_ALIAS("ppp-compress-" __stringify(CI_DEFLATE_DRAFT));
28 28 16 28 28 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 // SPDX-License-Identifier: GPL-2.0 /* -*- linux-c -*- * sysctl_net_core.c: sysctl interface to net core subsystem. * * Begun April 1, 1996, Mike Shaver. * Added /proc/sys/net/core directory entry (empty =) ). [MS] */ #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/module.h> #include <linux/socket.h> #include <linux/netdevice.h> #include <linux/ratelimit.h> #include <linux/vmalloc.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/kmemleak.h> #include <net/ip.h> #include <net/sock.h> #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> static int zero = 0; static int one = 1; static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; static long long_one __maybe_unused = 1; static long long_max __maybe_unused = LONG_MAX; static int net_msg_warn; /* Unused, but still a sysctl */ #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; struct ctl_table tmp = { .data = &size, .maxlen = sizeof(size), .mode = table->mode }; struct rps_sock_flow_table *orig_sock_table, *sock_table; static DEFINE_MUTEX(sock_flow_mutex); mutex_lock(&sock_flow_mutex); orig_sock_table = rcu_dereference_protected(rps_sock_flow_table, lockdep_is_held(&sock_flow_mutex)); size = orig_size = orig_sock_table ? orig_sock_table->mask + 1 : 0; ret = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (write) { if (size) { if (size > 1<<29) { /* Enforce limit to prevent overflow */ mutex_unlock(&sock_flow_mutex); return -EINVAL; } size = roundup_pow_of_two(size); if (size != orig_size) { sock_table = vmalloc(RPS_SOCK_FLOW_TABLE_SIZE(size)); if (!sock_table) { mutex_unlock(&sock_flow_mutex); return -ENOMEM; } rps_cpu_mask = roundup_pow_of_two(nr_cpu_ids) - 1; sock_table->mask = size - 1; } else sock_table = orig_sock_table; for (i = 0; i < size; i++) sock_table->ents[i] = RPS_NO_CPU; } else sock_table = NULL; if (sock_table != orig_sock_table) { rcu_assign_pointer(rps_sock_flow_table, sock_table); if (sock_table) { static_key_slow_inc(&rps_needed); static_key_slow_inc(&rfs_needed); } if (orig_sock_table) { static_key_slow_dec(&rps_needed); static_key_slow_dec(&rfs_needed); synchronize_rcu(); vfree(orig_sock_table); } } } mutex_unlock(&sock_flow_mutex); return ret; } #endif /* CONFIG_RPS */ #ifdef CONFIG_NET_FLOW_LIMIT static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; cpumask_var_t mask; int i, len, ret = 0; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; if (write) { ret = cpumask_parse_user(buffer, *lenp, mask); if (ret) goto done; mutex_lock(&flow_limit_update_mutex); len = sizeof(*cur) + netdev_flow_limit_table_len; for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); cur = rcu_dereference_protected(sd->flow_limit, lockdep_is_held(&flow_limit_update_mutex)); if (cur && !cpumask_test_cpu(i, mask)) { RCU_INIT_POINTER(sd->flow_limit, NULL); synchronize_rcu(); kfree(cur); } else if (!cur && cpumask_test_cpu(i, mask)) { cur = kzalloc_node(len, GFP_KERNEL, cpu_to_node(i)); if (!cur) { /* not unwinding previous changes */ ret = -ENOMEM; goto write_unlock; } cur->num_buckets = netdev_flow_limit_table_len; rcu_assign_pointer(sd->flow_limit, cur); } } write_unlock: mutex_unlock(&flow_limit_update_mutex); } else { char kbuf[128]; if (*ppos || !*lenp) { *lenp = 0; goto done; } cpumask_clear(mask); rcu_read_lock(); for_each_possible_cpu(i) { sd = &per_cpu(softnet_data, i); if (rcu_dereference(sd->flow_limit)) cpumask_set_cpu(i, mask); } rcu_read_unlock(); len = min(sizeof(kbuf) - 1, *lenp); len = scnprintf(kbuf, len, "%*pb", cpumask_pr_args(mask)); if (!len) { *lenp = 0; goto done; } if (len < *lenp) kbuf[len++] = '\n'; if (copy_to_user(buffer, kbuf, len)) { ret = -EFAULT; goto done; } *lenp = len; *ppos += len; } done: free_cpumask_var(mask); return ret; } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; mutex_lock(&flow_limit_update_mutex); ptr = table->data; old = *ptr; ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write && !is_power_of_2(*ptr)) { *ptr = old; ret = -EINVAL; } mutex_unlock(&flow_limit_update_mutex); return ret; } #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { .data = id, .maxlen = IFNAMSIZ, }; int ret; qdisc_get_default(id, IFNAMSIZ); ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = qdisc_set_default(id); return ret; } #endif static int proc_do_dev_weight(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { static DEFINE_MUTEX(dev_weight_mutex); int ret, weight; mutex_lock(&dev_weight_mutex); ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { weight = READ_ONCE(weight_p); WRITE_ONCE(dev_rx_weight, weight * dev_weight_rx_bias); WRITE_ONCE(dev_tx_weight, weight * dev_weight_tx_bias); } mutex_unlock(&dev_weight_mutex); return ret; } static int proc_do_rss_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); fake_table.data = buf; fake_table.maxlen = sizeof(buf); return proc_dostring(&fake_table, write, buffer, lenp, ppos); } #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; struct ctl_table tmp = *table; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; tmp.data = &jit_enable; ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { *(int *)table->data = jit_enable; if (jit_enable == 2) pr_warn("bpf_jit_enable = 2 was set! NEVER use this in production, only for JIT debugging!\n"); } return ret; } # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } # endif /* CONFIG_HAVE_EBPF_JIT */ static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { .procname = "wmem_max", .data = &sysctl_wmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_max", .data = &sysctl_rmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, { .procname = "wmem_default", .data = &sysctl_wmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_sndbuf, }, { .procname = "rmem_default", .data = &sysctl_rmem_default, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &min_rcvbuf, }, { .procname = "dev_weight", .data = &weight_p, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, }, { .procname = "dev_weight_rx_bias", .data = &dev_weight_rx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, }, { .procname = "dev_weight_tx_bias", .data = &dev_weight_tx_bias, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_do_dev_weight, }, { .procname = "netdev_max_backlog", .data = &netdev_max_backlog, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "netdev_rss_key", .data = &netdev_rss_key, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_do_rss_key, }, #ifdef CONFIG_BPF_JIT { .procname = "bpf_jit_enable", .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax_bpf_enable, # ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = &one, .extra2 = &one, # else .extra1 = &zero, .extra2 = &two, # endif }, # ifdef CONFIG_HAVE_EBPF_JIT { .procname = "bpf_jit_harden", .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = &zero, .extra2 = &two, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, .proc_handler = proc_dointvec_minmax_bpf_restricted, .extra1 = &zero, .extra2 = &one, }, # endif { .procname = "bpf_jit_limit", .data = &bpf_jit_limit, .maxlen = sizeof(long), .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = &long_one, .extra2 = &bpf_jit_limit_max, }, #endif { .procname = "netdev_tstamp_prequeue", .data = &netdev_tstamp_prequeue, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "message_cost", .data = &net_ratelimit_state.interval, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "message_burst", .data = &net_ratelimit_state.burst, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "optmem_max", .data = &sysctl_optmem_max, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "tstamp_allow_data", .data = &sysctl_tstamp_allow_data, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, .extra2 = &one }, #ifdef CONFIG_RPS { .procname = "rps_sock_flow_entries", .maxlen = sizeof(int), .mode = 0644, .proc_handler = rps_sock_flow_sysctl }, #endif #ifdef CONFIG_NET_FLOW_LIMIT { .procname = "flow_limit_cpu_bitmap", .mode = 0644, .proc_handler = flow_limit_cpu_sysctl }, { .procname = "flow_limit_table_len", .data = &netdev_flow_limit_table_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = flow_limit_table_len_sysctl }, #endif /* CONFIG_NET_FLOW_LIMIT */ #ifdef CONFIG_NET_RX_BUSY_POLL { .procname = "busy_poll", .data = &sysctl_net_busy_poll, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, { .procname = "busy_read", .data = &sysctl_net_busy_read, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, #endif #ifdef CONFIG_NET_SCHED { .procname = "default_qdisc", .mode = 0644, .maxlen = IFNAMSIZ, .proc_handler = set_default_qdisc }, #endif #endif /* CONFIG_NET */ { .procname = "netdev_budget", .data = &netdev_budget, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "warnings", .data = &net_msg_warn, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_skb_frags", .data = &sysctl_max_skb_frags, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &one, .extra2 = &max_skb_frags, }, { .procname = "netdev_budget_usecs", .data = &netdev_budget_usecs, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, { } }; static struct ctl_table netns_core_table[] = { { .procname = "somaxconn", .data = &init_net.core.sysctl_somaxconn, .maxlen = sizeof(int), .mode = 0644, .extra1 = &zero, .proc_handler = proc_dointvec_minmax }, { } }; static __net_init int sysctl_core_net_init(struct net *net) { struct ctl_table *tbl; tbl = netns_core_table; if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(netns_core_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; tbl[0].data = &net->core.sysctl_somaxconn; /* Don't export any sysctls to unprivileged users */ if (net->user_ns != &init_user_ns) { tbl[0].procname = NULL; } } net->core.sysctl_hdr = register_net_sysctl(net, "net/core", tbl); if (net->core.sysctl_hdr == NULL) goto err_reg; return 0; err_reg: if (tbl != netns_core_table) kfree(tbl); err_dup: return -ENOMEM; } static __net_exit void sysctl_core_net_exit(struct net *net) { struct ctl_table *tbl; tbl = net->core.sysctl_hdr->ctl_table_arg; unregister_net_sysctl_table(net->core.sysctl_hdr); BUG_ON(tbl == netns_core_table); kfree(tbl); } static __net_initdata struct pernet_operations sysctl_core_ops = { .init = sysctl_core_net_init, .exit = sysctl_core_net_exit, }; static __init int sysctl_core_init(void) { register_net_sysctl(&init_net, "net/core", net_core_table); return register_pernet_subsys(&sysctl_core_ops); } fs_initcall(sysctl_core_init);
231 96 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 /* * Bridge per vlan tunnel port dst_metadata handling code * * Authors: * Roopa Prabhu <roopa@cumulusnetworks.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/slab.h> #include <net/switchdev.h> #include <net/dst_metadata.h> #include "br_private.h" #include "br_private_tunnel.h" static inline int br_vlan_tunid_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct net_bridge_vlan *vle = ptr; __be64 tunid = *(__be64 *)arg->key; return vle->tinfo.tunnel_id != tunid; } static const struct rhashtable_params br_vlan_tunnel_rht_params = { .head_offset = offsetof(struct net_bridge_vlan, tnode), .key_offset = offsetof(struct net_bridge_vlan, tinfo.tunnel_id), .key_len = sizeof(__be64), .nelem_hint = 3, .locks_mul = 1, .obj_cmpfn = br_vlan_tunid_cmp, .automatic_shrinking = true, }; static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl, u64 tunnel_id) { return rhashtable_lookup_fast(tbl, &tunnel_id, br_vlan_tunnel_rht_params); } static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan) { struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst); WRITE_ONCE(vlan->tinfo.tunnel_id, 0); RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL); dst_release(&tdst->dst); } void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan) { if (!rcu_access_pointer(vlan->tinfo.tunnel_dst)) return; rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); vlan_tunnel_info_release(vlan); } static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u32 tun_id) { struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst); __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id)); int err; if (metadata) return -EEXIST; metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, key, 0); if (!metadata) return -EINVAL; metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE; rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata); WRITE_ONCE(vlan->tinfo.tunnel_id, key); err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); if (err) goto out; return 0; out: vlan_tunnel_info_release(vlan); return err; } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_tunnel_info_add(struct net_bridge_port *port, u16 vid, u32 tun_id) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *vlan; ASSERT_RTNL(); vg = nbp_vlan_group(port); vlan = br_vlan_find(vg, vid); if (!vlan) return -EINVAL; return __vlan_tunnel_info_add(vg, vlan, tun_id); } /* Must be protected by RTNL. * Must be called with vid in range from 1 to 4094 inclusive. */ int nbp_vlan_tunnel_info_delete(struct net_bridge_port *port, u16 vid) { struct net_bridge_vlan_group *vg; struct net_bridge_vlan *v; ASSERT_RTNL(); vg = nbp_vlan_group(port); v = br_vlan_find(vg, vid); if (!v) return -ENOENT; vlan_tunnel_info_del(vg, v); return 0; } static void __vlan_tunnel_info_flush(struct net_bridge_vlan_group *vg) { struct net_bridge_vlan *vlan, *tmp; list_for_each_entry_safe(vlan, tmp, &vg->vlan_list, vlist) vlan_tunnel_info_del(vg, vlan); } void nbp_vlan_tunnel_info_flush(struct net_bridge_port *port) { struct net_bridge_vlan_group *vg; ASSERT_RTNL(); vg = nbp_vlan_group(port); __vlan_tunnel_info_flush(vg); } int vlan_tunnel_init(struct net_bridge_vlan_group *vg) { return rhashtable_init(&vg->tunnel_hash, &br_vlan_tunnel_rht_params); } void vlan_tunnel_deinit(struct net_bridge_vlan_group *vg) { rhashtable_destroy(&vg->tunnel_hash); } int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_port *p, struct net_bridge_vlan_group *vg) { struct ip_tunnel_info *tinfo = skb_tunnel_info(skb); struct net_bridge_vlan *vlan; if (!vg || !tinfo) return 0; /* if already tagged, ignore */ if (skb_vlan_tagged(skb)) return 0; /* lookup vid, given tunnel id */ vlan = br_vlan_tunnel_lookup(&vg->tunnel_hash, tinfo->key.tun_id); if (!vlan) return 0; skb_dst_drop(skb); __vlan_hwaccel_put_tag(skb, p->br->vlan_proto, vlan->vid); return 0; } int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan) { struct metadata_dst *tunnel_dst; __be64 tunnel_id; int err; if (!vlan) return 0; tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id); if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb))) return 0; skb_dst_drop(skb); err = skb_vlan_pop(skb); if (err) return err; tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) skb_dst_set(skb, &tunnel_dst->dst); return 0; }
11 4 4 5 4 3 3 3 3 3 11 7 7 7 18 21 21 12 11 2 12 2 1 3 19 19 1 3 14 8 25 3 3 3 8 7 1 1 4 4 4 1 4 3 1 1 1 4 1 1 13 4 4 13 7 10 8 4 3 14 13 12 14 9 1 9 9 9 9 1 8 7 6 2 1 1 1 4 9 8 7 9 1 4 2 2 1 1 1 1 6 5 15 1 7 1 6 1 3 1 5 1 4 3 1 2 14 16 7 39 38 25 38 38 38 38 13 39 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 /* * IEEE802154.4 socket interface * * Copyright 2007, 2008 Siemens AG * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 * as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * Written by: * Sergey Lapin <slapin@ossfans.org> * Maxim Gorbachyov <maxim.gorbachev@siemens.com> */ #include <linux/net.h> #include <linux/capability.h> #include <linux/module.h> #include <linux/if_arp.h> #include <linux/if.h> #include <linux/termios.h> /* For TIOCOUTQ/INQ */ #include <linux/list.h> #include <linux/slab.h> #include <net/datalink.h> #include <net/psnap.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/route.h> #include <net/af_ieee802154.h> #include <net/ieee802154_netdev.h> /* Utility function for families */ static struct net_device* ieee802154_get_dev(struct net *net, const struct ieee802154_addr *addr) { struct net_device *dev = NULL; struct net_device *tmp; __le16 pan_id, short_addr; u8 hwaddr[IEEE802154_ADDR_LEN]; switch (addr->mode) { case IEEE802154_ADDR_LONG: ieee802154_devaddr_to_raw(hwaddr, addr->extended_addr); rcu_read_lock(); dev = dev_getbyhwaddr_rcu(net, ARPHRD_IEEE802154, hwaddr); if (dev) dev_hold(dev); rcu_read_unlock(); break; case IEEE802154_ADDR_SHORT: if (addr->pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST) || addr->short_addr == cpu_to_le16(IEEE802154_ADDR_UNDEF) || addr->short_addr == cpu_to_le16(IEEE802154_ADDR_BROADCAST)) break; rtnl_lock(); for_each_netdev(net, tmp) { if (tmp->type != ARPHRD_IEEE802154) continue; pan_id = tmp->ieee802154_ptr->pan_id; short_addr = tmp->ieee802154_ptr->short_addr; if (pan_id == addr->pan_id && short_addr == addr->short_addr) { dev = tmp; dev_hold(dev); break; } } rtnl_unlock(); break; default: pr_warn("Unsupported ieee802154 address type: %d\n", addr->mode); break; } return dev; } static int ieee802154_sock_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { sock->sk = NULL; sk->sk_prot->close(sk, 0); } return 0; } static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; return sk->sk_prot->sendmsg(sk, msg, len); } static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; if (sk->sk_prot->bind) return sk->sk_prot->bind(sk, uaddr, addr_len); return sock_no_bind(sock, uaddr, addr_len); } static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); return sk->sk_prot->connect(sk, uaddr, addr_len); } static int ieee802154_dev_ioctl(struct sock *sk, struct ifreq __user *arg, unsigned int cmd) { struct ifreq ifr; int ret = -ENOIOCTLCMD; struct net_device *dev; if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) return -EFAULT; ifr.ifr_name[IFNAMSIZ-1] = 0; dev_load(sock_net(sk), ifr.ifr_name); dev = dev_get_by_name(sock_net(sk), ifr.ifr_name); if (!dev) return -ENODEV; if (dev->type == ARPHRD_IEEE802154 && dev->netdev_ops->ndo_do_ioctl) ret = dev->netdev_ops->ndo_do_ioctl(dev, &ifr, cmd); if (!ret && copy_to_user(arg, &ifr, sizeof(struct ifreq))) ret = -EFAULT; dev_put(dev); return ret; } static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; switch (cmd) { case SIOCGSTAMP: return sock_get_timestamp(sk, (struct timeval __user *)arg); case SIOCGSTAMPNS: return sock_get_timestampns(sk, (struct timespec __user *)arg); case SIOCGIFADDR: case SIOCSIFADDR: return ieee802154_dev_ioctl(sk, (struct ifreq __user *)arg, cmd); default: if (!sk->sk_prot->ioctl) return -ENOIOCTLCMD; return sk->sk_prot->ioctl(sk, cmd, arg); } } /* RAW Sockets (802.15.4 created in userspace) */ static HLIST_HEAD(raw_head); static DEFINE_RWLOCK(raw_lock); static int raw_hash(struct sock *sk) { write_lock_bh(&raw_lock); sk_add_node(sk, &raw_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&raw_lock); return 0; } static void raw_unhash(struct sock *sk) { write_lock_bh(&raw_lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&raw_lock); } static void raw_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) { struct ieee802154_addr addr; struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr; int err = 0; struct net_device *dev = NULL; err = ieee802154_sockaddr_check_size(uaddr, len); if (err < 0) return err; uaddr = (struct sockaddr_ieee802154 *)_uaddr; if (uaddr->family != AF_IEEE802154) return -EINVAL; lock_sock(sk); ieee802154_addr_from_sa(&addr, &uaddr->addr); dev = ieee802154_get_dev(sock_net(sk), &addr); if (!dev) { err = -ENODEV; goto out; } sk->sk_bound_dev_if = dev->ifindex; sk_dst_reset(sk); dev_put(dev); out: release_sock(sk); return err; } static int raw_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { return -ENOTSUPP; } static int raw_disconnect(struct sock *sk, int flags) { return 0; } static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; struct sk_buff *skb; int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); return -EOPNOTSUPP; } lock_sock(sk); if (!sk->sk_bound_dev_if) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); else dev = dev_get_by_index(sock_net(sk), sk->sk_bound_dev_if); release_sock(sk); if (!dev) { pr_debug("no dev\n"); err = -ENXIO; goto out; } mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } if (!size) { err = 0; goto out_dev; } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; skb_reserve(skb, hlen); skb_reset_mac_header(skb); skb_reset_network_header(skb); err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); dev_put(dev); return err ?: size; out_skb: kfree_skb(skb); out_dev: dev_put(dev); out: return err; } static int raw_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_ts_and_drops(msg, sk, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) { skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS; } static void ieee802154_raw_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk; read_lock(&raw_lock); sk_for_each(sk, &raw_head) { bh_lock_sock(sk); if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dev->ifindex) { struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); if (clone) raw_rcv_skb(sk, clone); } bh_unlock_sock(sk); } read_unlock(&raw_lock); } static int raw_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { return -EOPNOTSUPP; } static int raw_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { return -EOPNOTSUPP; } static struct proto ieee802154_raw_prot = { .name = "IEEE-802.15.4-RAW", .owner = THIS_MODULE, .obj_size = sizeof(struct sock), .close = raw_close, .bind = raw_bind, .sendmsg = raw_sendmsg, .recvmsg = raw_recvmsg, .hash = raw_hash, .unhash = raw_unhash, .connect = raw_connect, .disconnect = raw_disconnect, .getsockopt = raw_getsockopt, .setsockopt = raw_setsockopt, }; static const struct proto_ops ieee802154_raw_ops = { .family = PF_IEEE802154, .owner = THIS_MODULE, .release = ieee802154_sock_release, .bind = ieee802154_sock_bind, .connect = ieee802154_sock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif }; /* DGRAM Sockets (802.15.4 dataframes) */ static HLIST_HEAD(dgram_head); static DEFINE_RWLOCK(dgram_lock); struct dgram_sock { struct sock sk; struct ieee802154_addr src_addr; struct ieee802154_addr dst_addr; unsigned int bound:1; unsigned int connected:1; unsigned int want_ack:1; unsigned int secen:1; unsigned int secen_override:1; unsigned int seclevel:3; unsigned int seclevel_override:1; }; static inline struct dgram_sock *dgram_sk(const struct sock *sk) { return container_of(sk, struct dgram_sock, sk); } static int dgram_hash(struct sock *sk) { write_lock_bh(&dgram_lock); sk_add_node(sk, &dgram_head); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); write_unlock_bh(&dgram_lock); return 0; } static void dgram_unhash(struct sock *sk) { write_lock_bh(&dgram_lock); if (sk_del_node_init(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); write_unlock_bh(&dgram_lock); } static int dgram_init(struct sock *sk) { struct dgram_sock *ro = dgram_sk(sk); ro->want_ack = 1; return 0; } static void dgram_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct ieee802154_addr haddr; struct dgram_sock *ro = dgram_sk(sk); int err = -EINVAL; struct net_device *dev; lock_sock(sk); ro->bound = 0; err = ieee802154_sockaddr_check_size(addr, len); if (err < 0) goto out; if (addr->family != AF_IEEE802154) { err = -EINVAL; goto out; } ieee802154_addr_from_sa(&haddr, &addr->addr); dev = ieee802154_get_dev(sock_net(sk), &haddr); if (!dev) { err = -ENODEV; goto out; } if (dev->type != ARPHRD_IEEE802154) { err = -ENODEV; goto out_put; } ro->src_addr = haddr; ro->bound = 1; err = 0; out_put: dev_put(dev); out: release_sock(sk); return err; } static int dgram_ioctl(struct sock *sk, int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: { int amount = sk_wmem_alloc_get(sk); return put_user(amount, (int __user *)arg); } case SIOCINQ: { struct sk_buff *skb; unsigned long amount; amount = 0; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) { /* We will only return the amount * of this packet since that is all * that will be read. */ amount = skb->len - ieee802154_hdr_length(skb); } spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } } return -ENOIOCTLCMD; } /* FIXME: autobind */ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct dgram_sock *ro = dgram_sk(sk); int err = 0; err = ieee802154_sockaddr_check_size(addr, len); if (err < 0) return err; if (addr->family != AF_IEEE802154) return -EINVAL; lock_sock(sk); if (!ro->bound) { err = -ENETUNREACH; goto out; } ieee802154_addr_from_sa(&ro->dst_addr, &addr->addr); ro->connected = 1; out: release_sock(sk); return err; } static int dgram_disconnect(struct sock *sk, int flags) { struct dgram_sock *ro = dgram_sk(sk); lock_sock(sk); ro->connected = 0; release_sock(sk); return 0; } static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct net_device *dev; unsigned int mtu; struct sk_buff *skb; struct ieee802154_mac_cb *cb; struct dgram_sock *ro = dgram_sk(sk); struct ieee802154_addr dst_addr; DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); int hlen, tlen; int err; if (msg->msg_flags & MSG_OOB) { pr_debug("msg->msg_flags = 0x%x\n", msg->msg_flags); return -EOPNOTSUPP; } if (msg->msg_name) { if (ro->connected) return -EISCONN; if (msg->msg_namelen < IEEE802154_MIN_NAMELEN) return -EINVAL; err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen); if (err < 0) return err; ieee802154_addr_from_sa(&dst_addr, &daddr->addr); } else { if (!ro->connected) return -EDESTADDRREQ; dst_addr = ro->dst_addr; } if (!ro->bound) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); else dev = ieee802154_get_dev(sock_net(sk), &ro->src_addr); if (!dev) { pr_debug("no dev\n"); err = -ENXIO; goto out; } mtu = IEEE802154_MTU; pr_debug("name = %s, mtu = %u\n", dev->name, mtu); if (size > mtu) { pr_debug("size = %zu, mtu = %u\n", size, mtu); err = -EMSGSIZE; goto out_dev; } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; skb = sock_alloc_send_skb(sk, hlen + tlen + size, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) goto out_dev; skb_reserve(skb, hlen); skb_reset_network_header(skb); cb = mac_cb_init(skb); cb->type = IEEE802154_FC_TYPE_DATA; cb->ackreq = ro->want_ack; cb->secen = ro->secen; cb->secen_override = ro->secen_override; cb->seclevel = ro->seclevel; cb->seclevel_override = ro->seclevel_override; err = wpan_dev_hard_header(skb, dev, &dst_addr, ro->bound ? &ro->src_addr : NULL, size); if (err < 0) goto out_skb; err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto out_skb; skb->dev = dev; skb->protocol = htons(ETH_P_IEEE802154); err = dev_queue_xmit(skb); if (err > 0) err = net_xmit_errno(err); dev_put(dev); return err ?: size; out_skb: kfree_skb(skb); out_dev: dev_put(dev); out: return err; } static int dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { size_t copied = 0; int err = -EOPNOTSUPP; struct sk_buff *skb; DECLARE_SOCKADDR(struct sockaddr_ieee802154 *, saddr, msg->msg_name); skb = skb_recv_datagram(sk, flags, noblock, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } /* FIXME: skip headers if necessary ?! */ err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_ts_and_drops(msg, sk, skb); if (saddr) { /* Clear the implicit padding in struct sockaddr_ieee802154 * (16 bits between 'family' and 'addr') and in struct * ieee802154_addr_sa (16 bits at the end of the structure). */ memset(saddr, 0, sizeof(*saddr)); saddr->family = AF_IEEE802154; ieee802154_addr_to_sa(&saddr->addr, &mac_cb(skb)->source); *addr_len = sizeof(*saddr); } if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: if (err) return err; return copied; } static int dgram_rcv_skb(struct sock *sk, struct sk_buff *skb) { skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return NET_RX_DROP; if (sock_queue_rcv_skb(sk, skb) < 0) { kfree_skb(skb); return NET_RX_DROP; } return NET_RX_SUCCESS; } static inline bool ieee802154_match_sock(__le64 hw_addr, __le16 pan_id, __le16 short_addr, struct dgram_sock *ro) { if (!ro->bound) return true; if (ro->src_addr.mode == IEEE802154_ADDR_LONG && hw_addr == ro->src_addr.extended_addr) return true; if (ro->src_addr.mode == IEEE802154_ADDR_SHORT && pan_id == ro->src_addr.pan_id && short_addr == ro->src_addr.short_addr) return true; return false; } static int ieee802154_dgram_deliver(struct net_device *dev, struct sk_buff *skb) { struct sock *sk, *prev = NULL; int ret = NET_RX_SUCCESS; __le16 pan_id, short_addr; __le64 hw_addr; /* Data frame processing */ BUG_ON(dev->type != ARPHRD_IEEE802154); pan_id = dev->ieee802154_ptr->pan_id; short_addr = dev->ieee802154_ptr->short_addr; hw_addr = dev->ieee802154_ptr->extended_addr; read_lock(&dgram_lock); sk_for_each(sk, &dgram_head) { if (ieee802154_match_sock(hw_addr, pan_id, short_addr, dgram_sk(sk))) { if (prev) { struct sk_buff *clone; clone = skb_clone(skb, GFP_ATOMIC); if (clone) dgram_rcv_skb(prev, clone); } prev = sk; } } if (prev) { dgram_rcv_skb(prev, skb); } else { kfree_skb(skb); ret = NET_RX_DROP; } read_unlock(&dgram_lock); return ret; } static int dgram_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct dgram_sock *ro = dgram_sk(sk); int val, len; if (level != SOL_IEEE802154) return -EOPNOTSUPP; if (get_user(len, optlen)) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case WPAN_WANTACK: val = ro->want_ack; break; case WPAN_SECURITY: if (!ro->secen_override) val = WPAN_SECURITY_DEFAULT; else if (ro->secen) val = WPAN_SECURITY_ON; else val = WPAN_SECURITY_OFF; break; case WPAN_SECURITY_LEVEL: if (!ro->seclevel_override) val = WPAN_SECURITY_LEVEL_DEFAULT; else val = ro->seclevel; break; default: return -ENOPROTOOPT; } if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int dgram_setsockopt(struct sock *sk, int level, int optname, char __user *optval, unsigned int optlen) { struct dgram_sock *ro = dgram_sk(sk); struct net *net = sock_net(sk); int val; int err = 0; if (optlen < sizeof(int)) return -EINVAL; if (get_user(val, (int __user *)optval)) return -EFAULT; lock_sock(sk); switch (optname) { case WPAN_WANTACK: ro->want_ack = !!val; break; case WPAN_SECURITY: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; break; } switch (val) { case WPAN_SECURITY_DEFAULT: ro->secen_override = 0; break; case WPAN_SECURITY_ON: ro->secen_override = 1; ro->secen = 1; break; case WPAN_SECURITY_OFF: ro->secen_override = 1; ro->secen = 0; break; default: err = -EINVAL; break; } break; case WPAN_SECURITY_LEVEL: if (!ns_capable(net->user_ns, CAP_NET_ADMIN) && !ns_capable(net->user_ns, CAP_NET_RAW)) { err = -EPERM; break; } if (val < WPAN_SECURITY_LEVEL_DEFAULT || val > IEEE802154_SCF_SECLEVEL_ENC_MIC128) { err = -EINVAL; } else if (val == WPAN_SECURITY_LEVEL_DEFAULT) { ro->seclevel_override = 0; } else { ro->seclevel_override = 1; ro->seclevel = val; } break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static struct proto ieee802154_dgram_prot = { .name = "IEEE-802.15.4-MAC", .owner = THIS_MODULE, .obj_size = sizeof(struct dgram_sock), .init = dgram_init, .close = dgram_close, .bind = dgram_bind, .sendmsg = dgram_sendmsg, .recvmsg = dgram_recvmsg, .hash = dgram_hash, .unhash = dgram_unhash, .connect = dgram_connect, .disconnect = dgram_disconnect, .ioctl = dgram_ioctl, .getsockopt = dgram_getsockopt, .setsockopt = dgram_setsockopt, }; static const struct proto_ops ieee802154_dgram_ops = { .family = PF_IEEE802154, .owner = THIS_MODULE, .release = ieee802154_sock_release, .bind = ieee802154_sock_bind, .connect = ieee802154_sock_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .poll = datagram_poll, .ioctl = ieee802154_sock_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = ieee802154_sock_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_sock_common_setsockopt, .compat_getsockopt = compat_sock_common_getsockopt, #endif }; static void ieee802154_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); } /* Create a socket. Initialise the socket, blank the addresses * set the state. */ static int ieee802154_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; int rc; struct proto *proto; const struct proto_ops *ops; if (!net_eq(net, &init_net)) return -EAFNOSUPPORT; switch (sock->type) { case SOCK_RAW: rc = -EPERM; if (!capable(CAP_NET_RAW)) goto out; proto = &ieee802154_raw_prot; ops = &ieee802154_raw_ops; break; case SOCK_DGRAM: proto = &ieee802154_dgram_prot; ops = &ieee802154_dgram_ops; break; default: rc = -ESOCKTNOSUPPORT; goto out; } rc = -ENOMEM; sk = sk_alloc(net, PF_IEEE802154, GFP_KERNEL, proto, kern); if (!sk) goto out; rc = 0; sock->ops = ops; sock_init_data(sock, sk); sk->sk_destruct = ieee802154_sock_destruct; sk->sk_family = PF_IEEE802154; /* Checksums on by default */ sock_set_flag(sk, SOCK_ZAPPED); if (sk->sk_prot->hash) { rc = sk->sk_prot->hash(sk); if (rc) { sk_common_release(sk); goto out; } } if (sk->sk_prot->init) { rc = sk->sk_prot->init(sk); if (rc) sk_common_release(sk); } out: return rc; } static const struct net_proto_family ieee802154_family_ops = { .family = PF_IEEE802154, .create = ieee802154_create, .owner = THIS_MODULE, }; static int ieee802154_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { if (!netif_running(dev)) goto drop; pr_debug("got frame, type %d, dev %p\n", dev->type, dev); #ifdef DEBUG print_hex_dump_bytes("ieee802154_rcv ", DUMP_PREFIX_NONE, skb->data, skb->len); #endif if (!net_eq(dev_net(dev), &init_net)) goto drop; ieee802154_raw_deliver(dev, skb); if (dev->type != ARPHRD_IEEE802154) goto drop; if (skb->pkt_type != PACKET_OTHERHOST) return ieee802154_dgram_deliver(dev, skb); drop: kfree_skb(skb); return NET_RX_DROP; } static struct packet_type ieee802154_packet_type = { .type = htons(ETH_P_IEEE802154), .func = ieee802154_rcv, }; static int __init af_ieee802154_init(void) { int rc = -EINVAL; rc = proto_register(&ieee802154_raw_prot, 1); if (rc) goto out; rc = proto_register(&ieee802154_dgram_prot, 1); if (rc) goto err_dgram; /* Tell SOCKET that we are alive */ rc = sock_register(&ieee802154_family_ops); if (rc) goto err_sock; dev_add_pack(&ieee802154_packet_type); rc = 0; goto out; err_sock: proto_unregister(&ieee802154_dgram_prot); err_dgram: proto_unregister(&ieee802154_raw_prot); out: return rc; } static void __exit af_ieee802154_remove(void) { dev_remove_pack(&ieee802154_packet_type); sock_unregister(PF_IEEE802154); proto_unregister(&ieee802154_dgram_prot); proto_unregister(&ieee802154_raw_prot); } module_init(af_ieee802154_init); module_exit(af_ieee802154_remove); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_IEEE802154);
9 5 1 4 3 1 3 3 2 1 2 4 2 2 1 1 1 2 9 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 // SPDX-License-Identifier: GPL-2.0 /* * fs/isofs/export.c * * (C) 2004 Paul Serice - The new inode scheme requires switching * from iget() to iget5_locked() which means * the NFS export operations have to be hand * coded because the default routines rely on * iget(). * * The following files are helpful: * * Documentation/filesystems/nfs/Exporting * fs/exportfs/expfs.c. */ #include "isofs.h" static struct dentry * isofs_export_iget(struct super_block *sb, unsigned long block, unsigned long offset, __u32 generation) { struct inode *inode; if (block == 0) return ERR_PTR(-ESTALE); inode = isofs_iget(sb, block, offset); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); return ERR_PTR(-ESTALE); } return d_obtain_alias(inode); } /* This function is surprisingly simple. The trick is understanding * that "child" is always a directory. So, to find its parent, you * simply need to find its ".." entry, normalize its block and offset, * and return the underlying inode. See the comments for * isofs_normalize_block_and_offset(). */ static struct dentry *isofs_export_get_parent(struct dentry *child) { unsigned long parent_block = 0; unsigned long parent_offset = 0; struct inode *child_inode = d_inode(child); struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); struct iso_directory_record *de = NULL; struct buffer_head * bh = NULL; struct dentry *rv = NULL; /* "child" must always be a directory. */ if (!S_ISDIR(child_inode->i_mode)) { printk(KERN_ERR "isofs: isofs_export_get_parent(): " "child is not a directory!\n"); rv = ERR_PTR(-EACCES); goto out; } /* It is an invariant that the directory offset is zero. If * it is not zero, it means the directory failed to be * normalized for some reason. */ if (e_child_inode->i_iget5_offset != 0) { printk(KERN_ERR "isofs: isofs_export_get_parent(): " "child directory not normalized!\n"); rv = ERR_PTR(-EACCES); goto out; } /* The child inode has been normalized such that its * i_iget5_block value points to the "." entry. Fortunately, * the ".." entry is located in the same block. */ parent_block = e_child_inode->i_iget5_block; /* Get the block in question. */ bh = sb_bread(child_inode->i_sb, parent_block); if (bh == NULL) { rv = ERR_PTR(-EACCES); goto out; } /* This is the "." entry. */ de = (struct iso_directory_record*)bh->b_data; /* The ".." entry is always the second entry. */ parent_offset = (unsigned long)isonum_711(de->length); de = (struct iso_directory_record*)(bh->b_data + parent_offset); /* Verify it is in fact the ".." entry. */ if ((isonum_711(de->name_len) != 1) || (de->name[0] != 1)) { printk(KERN_ERR "isofs: Unable to find the \"..\" " "directory for NFS.\n"); rv = ERR_PTR(-EACCES); goto out; } /* Normalize */ isofs_normalize_block_and_offset(de, &parent_block, &parent_offset); rv = d_obtain_alias(isofs_iget(child_inode->i_sb, parent_block, parent_offset)); out: if (bh) brelse(bh); return rv; } static int isofs_export_encode_fh(struct inode *inode, __u32 *fh32, int *max_len, struct inode *parent) { struct iso_inode_info * ei = ISOFS_I(inode); int len = *max_len; int type = 1; __u16 *fh16 = (__u16*)fh32; /* * WARNING: max_len is 5 for NFSv2. Because of this * limitation, we use the lower 16 bits of fh32[1] to hold the * offset of the inode and the upper 16 bits of fh32[1] to * hold the offset of the parent. */ if (parent && (len < 5)) { *max_len = 5; return FILEID_INVALID; } else if (len < 3) { *max_len = 3; return FILEID_INVALID; } len = 3; fh32[0] = ei->i_iget5_block; fh16[2] = (__u16)ei->i_iget5_offset; /* fh16 [sic] */ fh16[3] = 0; /* avoid leaking uninitialized data */ fh32[2] = inode->i_generation; if (parent) { struct iso_inode_info *eparent; eparent = ISOFS_I(parent); fh32[3] = eparent->i_iget5_block; fh16[3] = (__u16)eparent->i_iget5_offset; /* fh16 [sic] */ fh32[4] = parent->i_generation; len = 5; type = 2; } *max_len = len; return type; } struct isofs_fid { u32 block; u16 offset; u16 parent_offset; u32 generation; u32 parent_block; u32 parent_generation; }; static struct dentry *isofs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct isofs_fid *ifid = (struct isofs_fid *)fid; if (fh_len < 3 || fh_type > 2) return NULL; return isofs_export_iget(sb, ifid->block, ifid->offset, ifid->generation); } static struct dentry *isofs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { struct isofs_fid *ifid = (struct isofs_fid *)fid; if (fh_len < 2 || fh_type != 2) return NULL; return isofs_export_iget(sb, fh_len > 2 ? ifid->parent_block : 0, ifid->parent_offset, fh_len > 4 ? ifid->parent_generation : 0); } const struct export_operations isofs_export_ops = { .encode_fh = isofs_export_encode_fh, .fh_to_dentry = isofs_fh_to_dentry, .fh_to_parent = isofs_fh_to_parent, .get_parent = isofs_export_get_parent, };
71 1 70 70 70 70 70 71 8 1 8 2 2 2 2 92 92 92 92 92 1 91 91 92 92 64 71 71 71 71 69 69 71 71 2 2 2 2 9 9 127 127 127 127 124 124 102 124 112 93 93 93 93 43 43 40 96 96 96 96 119 2 22 124 38 124 124 3 3 3 3 3 3 3 3 3 85 92 19 78 7 75 75 74 1 74 75 75 75 75 26 92 26 26 19 11 7 18 17 17 18 18 18 13 13 13 25 13 13 13 85 85 85 85 85 85 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 91 92 1 1 1 1 1 1 98 86 12 98 87 11 98 98 85 81 81 98 2 126 2 126 124 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 /* * Copyright (C) International Business Machines Corp., 2000-2004 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See * the GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * jfs_imap.c: inode allocation map manager * * Serialization: * Each AG has a simple lock which is used to control the serialization of * the AG level lists. This lock should be taken first whenever an AG * level list will be modified or accessed. * * Each IAG is locked by obtaining the buffer for the IAG page. * * There is also a inode lock for the inode map inode. A read lock needs to * be taken whenever an IAG is read from the map or the global level * information is read. A write lock needs to be taken whenever the global * level information is modified or an atomic operation needs to be used. * * If more than one IAG is read at one time, the read lock may not * be given up until all of the IAG's are read. Otherwise, a deadlock * may occur when trying to obtain the read lock while another thread * holding the read lock is waiting on the IAG already being held. * * The control page of the inode map is read into memory by diMount(). * Thereafter it should only be modified in memory and then it will be * written out when the filesystem is unmounted by diUnmount(). */ #include <linux/fs.h> #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/quotaops.h> #include <linux/slab.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" #include "jfs_dinode.h" #include "jfs_dmap.h" #include "jfs_imap.h" #include "jfs_metapage.h" #include "jfs_superblock.h" #include "jfs_debug.h" /* * imap locks */ /* iag free list lock */ #define IAGFREE_LOCK_INIT(imap) mutex_init(&imap->im_freelock) #define IAGFREE_LOCK(imap) mutex_lock(&imap->im_freelock) #define IAGFREE_UNLOCK(imap) mutex_unlock(&imap->im_freelock) /* per ag iag list locks */ #define AG_LOCK_INIT(imap,index) mutex_init(&(imap->im_aglock[index])) #define AG_LOCK(imap,agno) mutex_lock(&imap->im_aglock[agno]) #define AG_UNLOCK(imap,agno) mutex_unlock(&imap->im_aglock[agno]) /* * forward references */ static int diAllocAG(struct inomap *, int, bool, struct inode *); static int diAllocAny(struct inomap *, int, bool, struct inode *); static int diAllocBit(struct inomap *, struct iag *, int); static int diAllocExt(struct inomap *, int, struct inode *); static int diAllocIno(struct inomap *, int, struct inode *); static int diFindFree(u32, int); static int diNewExt(struct inomap *, struct iag *, int); static int diNewIAG(struct inomap *, int *, int, struct metapage **); static void duplicateIXtree(struct super_block *, s64, int, s64 *); static int diIAGRead(struct inomap * imap, int, struct metapage **); static int copy_from_dinode(struct dinode *, struct inode *); static void copy_to_dinode(struct dinode *, struct inode *); /* * NAME: diMount() * * FUNCTION: initialize the incore inode map control structures for * a fileset or aggregate init time. * * the inode map's control structure (dinomap) is * brought in from disk and placed in virtual memory. * * PARAMETERS: * ipimap - pointer to inode map inode for the aggregate or fileset. * * RETURN VALUES: * 0 - success * -ENOMEM - insufficient free virtual memory. * -EIO - i/o error. */ int diMount(struct inode *ipimap) { struct inomap *imap; struct metapage *mp; int index; struct dinomap_disk *dinom_le; /* * allocate/initialize the in-memory inode map control structure */ /* allocate the in-memory inode map control structure. */ imap = kmalloc(sizeof(struct inomap), GFP_KERNEL); if (imap == NULL) { jfs_err("diMount: kmalloc returned NULL!"); return -ENOMEM; } /* read the on-disk inode map control structure. */ mp = read_metapage(ipimap, IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, PSIZE, 0); if (mp == NULL) { kfree(imap); return -EIO; } /* copy the on-disk version to the in-memory version. */ dinom_le = (struct dinomap_disk *) mp->data; imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag); imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag); atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos)); atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree)); imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext); imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext); for (index = 0; index < MAXAG; index++) { imap->im_agctl[index].inofree = le32_to_cpu(dinom_le->in_agctl[index].inofree); imap->im_agctl[index].extfree = le32_to_cpu(dinom_le->in_agctl[index].extfree); imap->im_agctl[index].numinos = le32_to_cpu(dinom_le->in_agctl[index].numinos); imap->im_agctl[index].numfree = le32_to_cpu(dinom_le->in_agctl[index].numfree); } /* release the buffer. */ release_metapage(mp); /* * allocate/initialize inode allocation map locks */ /* allocate and init iag free list lock */ IAGFREE_LOCK_INIT(imap); /* allocate and init ag list locks */ for (index = 0; index < MAXAG; index++) { AG_LOCK_INIT(imap, index); } /* bind the inode map inode and inode map control structure * to each other. */ imap->im_ipimap = ipimap; JFS_IP(ipimap)->i_imap = imap; return (0); } /* * NAME: diUnmount() * * FUNCTION: write to disk the incore inode map control structures for * a fileset or aggregate at unmount time. * * PARAMETERS: * ipimap - pointer to inode map inode for the aggregate or fileset. * * RETURN VALUES: * 0 - success * -ENOMEM - insufficient free virtual memory. * -EIO - i/o error. */ int diUnmount(struct inode *ipimap, int mounterror) { struct inomap *imap = JFS_IP(ipimap)->i_imap; /* * update the on-disk inode map control structure */ if (!(mounterror || isReadOnly(ipimap))) diSync(ipimap); /* * Invalidate the page cache buffers */ truncate_inode_pages(ipimap->i_mapping, 0); /* * free in-memory control structure */ kfree(imap); return (0); } /* * diSync() */ int diSync(struct inode *ipimap) { struct dinomap_disk *dinom_le; struct inomap *imp = JFS_IP(ipimap)->i_imap; struct metapage *mp; int index; /* * write imap global conrol page */ /* read the on-disk inode map control structure */ mp = get_metapage(ipimap, IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, PSIZE, 0); if (mp == NULL) { jfs_err("diSync: get_metapage failed!"); return -EIO; } /* copy the in-memory version to the on-disk version */ dinom_le = (struct dinomap_disk *) mp->data; dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag); dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag); dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos)); dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree)); dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext); dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext); for (index = 0; index < MAXAG; index++) { dinom_le->in_agctl[index].inofree = cpu_to_le32(imp->im_agctl[index].inofree); dinom_le->in_agctl[index].extfree = cpu_to_le32(imp->im_agctl[index].extfree); dinom_le->in_agctl[index].numinos = cpu_to_le32(imp->im_agctl[index].numinos); dinom_le->in_agctl[index].numfree = cpu_to_le32(imp->im_agctl[index].numfree); } /* write out the control structure */ write_metapage(mp); /* * write out dirty pages of imap */ filemap_write_and_wait(ipimap->i_mapping); diWriteSpecial(ipimap, 0); return (0); } /* * NAME: diRead() * * FUNCTION: initialize an incore inode from disk. * * on entry, the specifed incore inode should itself * specify the disk inode number corresponding to the * incore inode (i.e. i_number should be initialized). * * this routine handles incore inode initialization for * both "special" and "regular" inodes. special inodes * are those required early in the mount process and * require special handling since much of the file system * is not yet initialized. these "special" inodes are * identified by a NULL inode map inode pointer and are * actually initialized by a call to diReadSpecial(). * * for regular inodes, the iag describing the disk inode * is read from disk to determine the inode extent address * for the disk inode. with the inode extent address in * hand, the page of the extent that contains the disk * inode is read and the disk inode is copied to the * incore inode. * * PARAMETERS: * ip - pointer to incore inode to be initialized from disk. * * RETURN VALUES: * 0 - success * -EIO - i/o error. * -ENOMEM - insufficient memory * */ int diRead(struct inode *ip) { struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); int iagno, ino, extno, rc; struct inode *ipimap; struct dinode *dp; struct iag *iagp; struct metapage *mp; s64 blkno, agstart; struct inomap *imap; int block_offset; int inodes_left; unsigned long pageno; int rel_inode; jfs_info("diRead: ino = %ld", ip->i_ino); ipimap = sbi->ipimap; JFS_IP(ip)->ipimap = ipimap; /* determine the iag number for this inode (number) */ iagno = INOTOIAG(ip->i_ino); /* read the iag */ imap = JFS_IP(ipimap)->i_imap; IREAD_LOCK(ipimap, RDWRLOCK_IMAP); rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) { jfs_err("diRead: diIAGRead returned %d", rc); return (rc); } iagp = (struct iag *) mp->data; /* determine inode extent that holds the disk inode */ ino = ip->i_ino & (INOSPERIAG - 1); extno = ino >> L2INOSPEREXT; if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) || (addressPXD(&iagp->inoext[extno]) == 0)) { release_metapage(mp); return -ESTALE; } /* get disk block number of the page within the inode extent * that holds the disk inode. */ blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage); /* get the ag for the iag */ agstart = le64_to_cpu(iagp->agstart); release_metapage(mp); rel_inode = (ino & (INOSPERPAGE - 1)); pageno = blkno >> sbi->l2nbperpage; if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { /* * OS/2 didn't always align inode extents on page boundaries */ inodes_left = (sbi->nbperpage - block_offset) << sbi->l2niperblk; if (rel_inode < inodes_left) rel_inode += block_offset << sbi->l2niperblk; else { pageno += 1; rel_inode -= inodes_left; } } /* read the page of disk inode */ mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); if (!mp) { jfs_err("diRead: read_metapage failed"); return -EIO; } /* locate the disk inode requested */ dp = (struct dinode *) mp->data; dp += rel_inode; if (ip->i_ino != le32_to_cpu(dp->di_number)) { jfs_error(ip->i_sb, "i_ino != di_number\n"); rc = -EIO; } else if (le32_to_cpu(dp->di_nlink) == 0) rc = -ESTALE; else /* copy the disk inode to the in-memory inode */ rc = copy_from_dinode(dp, ip); release_metapage(mp); /* set the ag for the inode */ JFS_IP(ip)->agstart = agstart; JFS_IP(ip)->active_ag = -1; return (rc); } /* * NAME: diReadSpecial() * * FUNCTION: initialize a 'special' inode from disk. * * this routines handles aggregate level inodes. The * inode cache cannot differentiate between the * aggregate inodes and the filesystem inodes, so we * handle these here. We don't actually use the aggregate * inode map, since these inodes are at a fixed location * and in some cases the aggregate inode map isn't initialized * yet. * * PARAMETERS: * sb - filesystem superblock * inum - aggregate inode number * secondary - 1 if secondary aggregate inode table * * RETURN VALUES: * new inode - success * NULL - i/o error. */ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) { struct jfs_sb_info *sbi = JFS_SBI(sb); uint address; struct dinode *dp; struct inode *ip; struct metapage *mp; ip = new_inode(sb); if (ip == NULL) { jfs_err("diReadSpecial: new_inode returned NULL!"); return ip; } if (secondary) { address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; JFS_IP(ip)->ipimap = sbi->ipaimap2; } else { address = AITBL_OFF >> L2PSIZE; JFS_IP(ip)->ipimap = sbi->ipaimap; } ASSERT(inum < INOSPEREXT); ip->i_ino = inum; address += inum >> 3; /* 8 inodes per 4K page */ /* read the page of fixed disk inode (AIT) in raw mode */ mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); if (mp == NULL) { set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); return (NULL); } /* get the pointer to the disk inode of interest */ dp = (struct dinode *) (mp->data); dp += inum % 8; /* 8 inodes per 4K page */ /* copy on-disk inode to in-memory inode */ if ((copy_from_dinode(dp, ip)) != 0) { /* handle bad return by returning NULL for ip */ set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); /* release the page */ release_metapage(mp); return (NULL); } ip->i_mapping->a_ops = &jfs_metapage_aops; mapping_set_gfp_mask(ip->i_mapping, GFP_NOFS); /* Allocations to metadata inodes should not affect quotas */ ip->i_flags |= S_NOQUOTA; if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) { sbi->gengen = le32_to_cpu(dp->di_gengen); sbi->inostamp = le32_to_cpu(dp->di_inostamp); } /* release the page */ release_metapage(mp); /* * __mark_inode_dirty expects inodes to be hashed. Since we don't * want special inodes in the fileset inode space, we make them * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ hlist_add_fake(&ip->i_hash); return (ip); } /* * NAME: diWriteSpecial() * * FUNCTION: Write the special inode to disk * * PARAMETERS: * ip - special inode * secondary - 1 if secondary aggregate inode table * * RETURN VALUES: none */ void diWriteSpecial(struct inode *ip, int secondary) { struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); uint address; struct dinode *dp; ino_t inum = ip->i_ino; struct metapage *mp; if (secondary) address = addressPXD(&sbi->ait2) >> sbi->l2nbperpage; else address = AITBL_OFF >> L2PSIZE; ASSERT(inum < INOSPEREXT); address += inum >> 3; /* 8 inodes per 4K page */ /* read the page of fixed disk inode (AIT) in raw mode */ mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); if (mp == NULL) { jfs_err("diWriteSpecial: failed to read aggregate inode extent!"); return; } /* get the pointer to the disk inode of interest */ dp = (struct dinode *) (mp->data); dp += inum % 8; /* 8 inodes per 4K page */ /* copy on-disk inode to in-memory inode */ copy_to_dinode(dp, ip); memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288); if (inum == FILESYSTEM_I) dp->di_gengen = cpu_to_le32(sbi->gengen); /* write the page */ write_metapage(mp); } /* * NAME: diFreeSpecial() * * FUNCTION: Free allocated space for special inode */ void diFreeSpecial(struct inode *ip) { if (ip == NULL) { jfs_err("diFreeSpecial called with NULL ip!"); return; } filemap_write_and_wait(ip->i_mapping); truncate_inode_pages(ip->i_mapping, 0); iput(ip); } /* * NAME: diWrite() * * FUNCTION: write the on-disk inode portion of the in-memory inode * to its corresponding on-disk inode. * * on entry, the specifed incore inode should itself * specify the disk inode number corresponding to the * incore inode (i.e. i_number should be initialized). * * the inode contains the inode extent address for the disk * inode. with the inode extent address in hand, the * page of the extent that contains the disk inode is * read and the disk inode portion of the incore inode * is copied to the disk inode. * * PARAMETERS: * tid - transacation id * ip - pointer to incore inode to be written to the inode extent. * * RETURN VALUES: * 0 - success * -EIO - i/o error. */ int diWrite(tid_t tid, struct inode *ip) { struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); struct jfs_inode_info *jfs_ip = JFS_IP(ip); int rc = 0; s32 ino; struct dinode *dp; s64 blkno; int block_offset; int inodes_left; struct metapage *mp; unsigned long pageno; int rel_inode; int dioffset; struct inode *ipimap; uint type; lid_t lid; struct tlock *ditlck, *tlck; struct linelock *dilinelock, *ilinelock; struct lv *lv; int n; ipimap = jfs_ip->ipimap; ino = ip->i_ino & (INOSPERIAG - 1); if (!addressPXD(&(jfs_ip->ixpxd)) || (lengthPXD(&(jfs_ip->ixpxd)) != JFS_IP(ipimap)->i_imap->im_nbperiext)) { jfs_error(ip->i_sb, "ixpxd invalid\n"); return -EIO; } /* * read the page of disk inode containing the specified inode: */ /* compute the block address of the page */ blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage); rel_inode = (ino & (INOSPERPAGE - 1)); pageno = blkno >> sbi->l2nbperpage; if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { /* * OS/2 didn't always align inode extents on page boundaries */ inodes_left = (sbi->nbperpage - block_offset) << sbi->l2niperblk; if (rel_inode < inodes_left) rel_inode += block_offset << sbi->l2niperblk; else { pageno += 1; rel_inode -= inodes_left; } } /* read the page of disk inode */ retry: mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); if (!mp) return -EIO; /* get the pointer to the disk inode */ dp = (struct dinode *) mp->data; dp += rel_inode; dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE; /* * acquire transaction lock on the on-disk inode; * N.B. tlock is acquired on ipimap not ip; */ if ((ditlck = txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL) goto retry; dilinelock = (struct linelock *) & ditlck->lock; /* * copy btree root from in-memory inode to on-disk inode * * (tlock is taken from inline B+-tree root in in-memory * inode when the B+-tree root is updated, which is pointed * by jfs_ip->blid as well as being on tx tlock list) * * further processing of btree root is based on the copy * in in-memory inode, where txLog() will log from, and, * for xtree root, txUpdateMap() will update map and reset * XAD_NEW bit; */ if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) { /* * This is the special xtree inside the directory for storing * the directory table */ xtpage_t *p, *xp; xad_t *xad; jfs_ip->xtlid = 0; tlck = lid_to_tlock(lid); assert(tlck->type & tlckXTREE); tlck->type |= tlckBTROOT; tlck->mp = mp; ilinelock = (struct linelock *) & tlck->lock; /* * copy xtree root from inode to dinode: */ p = &jfs_ip->i_xtroot; xp = (xtpage_t *) &dp->di_dirtable; lv = ilinelock->lv; for (n = 0; n < ilinelock->index; n++, lv++) { memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], lv->length << L2XTSLOTSIZE); } /* reset on-disk (metadata page) xtree XAD_NEW bit */ xad = &xp->xad[XTENTRYSTART]; for (n = XTENTRYSTART; n < le16_to_cpu(xp->header.nextindex); n++, xad++) if (xad->flag & (XAD_NEW | XAD_EXTENDED)) xad->flag &= ~(XAD_NEW | XAD_EXTENDED); } if ((lid = jfs_ip->blid) == 0) goto inlineData; jfs_ip->blid = 0; tlck = lid_to_tlock(lid); type = tlck->type; tlck->type |= tlckBTROOT; tlck->mp = mp; ilinelock = (struct linelock *) & tlck->lock; /* * regular file: 16 byte (XAD slot) granularity */ if (type & tlckXTREE) { xtpage_t *p, *xp; xad_t *xad; /* * copy xtree root from inode to dinode: */ p = &jfs_ip->i_xtroot; xp = &dp->di_xtroot; lv = ilinelock->lv; for (n = 0; n < ilinelock->index; n++, lv++) { memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], lv->length << L2XTSLOTSIZE); } /* reset on-disk (metadata page) xtree XAD_NEW bit */ xad = &xp->xad[XTENTRYSTART]; for (n = XTENTRYSTART; n < le16_to_cpu(xp->header.nextindex); n++, xad++) if (xad->flag & (XAD_NEW | XAD_EXTENDED)) xad->flag &= ~(XAD_NEW | XAD_EXTENDED); } /* * directory: 32 byte (directory entry slot) granularity */ else if (type & tlckDTREE) { dtpage_t *p, *xp; /* * copy dtree root from inode to dinode: */ p = (dtpage_t *) &jfs_ip->i_dtroot; xp = (dtpage_t *) & dp->di_dtroot; lv = ilinelock->lv; for (n = 0; n < ilinelock->index; n++, lv++) { memcpy(&xp->slot[lv->offset], &p->slot[lv->offset], lv->length << L2DTSLOTSIZE); } } else { jfs_err("diWrite: UFO tlock"); } inlineData: /* * copy inline symlink from in-memory inode to on-disk inode */ if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) { lv = & dilinelock->lv[dilinelock->index]; lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE; lv->length = 2; memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE); dilinelock->index++; } /* * copy inline data from in-memory inode to on-disk inode: * 128 byte slot granularity */ if (test_cflag(COMMIT_Inlineea, ip)) { lv = & dilinelock->lv[dilinelock->index]; lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE; lv->length = 1; memcpy(&dp->di_inlineea, jfs_ip->i_inline_ea, INODESLOTSIZE); dilinelock->index++; clear_cflag(COMMIT_Inlineea, ip); } /* * lock/copy inode base: 128 byte slot granularity */ lv = & dilinelock->lv[dilinelock->index]; lv->offset = dioffset >> L2INODESLOTSIZE; copy_to_dinode(dp, ip); if (test_and_clear_cflag(COMMIT_Dirtable, ip)) { lv->length = 2; memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96); } else lv->length = 1; dilinelock->index++; /* release the buffer holding the updated on-disk inode. * the buffer will be later written by commit processing. */ write_metapage(mp); return (rc); } /* * NAME: diFree(ip) * * FUNCTION: free a specified inode from the inode working map * for a fileset or aggregate. * * if the inode to be freed represents the first (only) * free inode within the iag, the iag will be placed on * the ag free inode list. * * freeing the inode will cause the inode extent to be * freed if the inode is the only allocated inode within * the extent. in this case all the disk resource backing * up the inode extent will be freed. in addition, the iag * will be placed on the ag extent free list if the extent * is the first free extent in the iag. if freeing the * extent also means that no free inodes will exist for * the iag, the iag will also be removed from the ag free * inode list. * * the iag describing the inode will be freed if the extent * is to be freed and it is the only backed extent within * the iag. in this case, the iag will be removed from the * ag free extent list and ag free inode list and placed on * the inode map's free iag list. * * a careful update approach is used to provide consistency * in the face of updates to multiple buffers. under this * approach, all required buffers are obtained before making * any updates and are held until all updates are complete. * * PARAMETERS: * ip - inode to be freed. * * RETURN VALUES: * 0 - success * -EIO - i/o error. */ int diFree(struct inode *ip) { int rc; ino_t inum = ip->i_ino; struct iag *iagp, *aiagp, *biagp, *ciagp, *diagp; struct metapage *mp, *amp, *bmp, *cmp, *dmp; int iagno, ino, extno, bitno, sword, agno; int back, fwd; u32 bitmap, mask; struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap; struct inomap *imap = JFS_IP(ipimap)->i_imap; pxd_t freepxd; tid_t tid; struct inode *iplist[3]; struct tlock *tlck; struct pxd_lock *pxdlock; /* * This is just to suppress compiler warnings. The same logic that * references these variables is used to initialize them. */ aiagp = biagp = ciagp = diagp = NULL; /* get the iag number containing the inode. */ iagno = INOTOIAG(inum); /* make sure that the iag is contained within * the map. */ if (iagno >= imap->im_nextiag) { print_hex_dump(KERN_ERR, "imap: ", DUMP_PREFIX_ADDRESS, 16, 4, imap, 32, 0); jfs_error(ip->i_sb, "inum = %d, iagno = %d, nextiag = %d\n", (uint) inum, iagno, imap->im_nextiag); return -EIO; } /* get the allocation group for this ino. */ agno = BLKTOAG(JFS_IP(ip)->agstart, JFS_SBI(ip->i_sb)); /* Lock the AG specific inode map information */ AG_LOCK(imap, agno); /* Obtain read lock in imap inode. Don't release it until we have * read all of the IAG's that we are going to. */ IREAD_LOCK(ipimap, RDWRLOCK_IMAP); /* read the iag. */ if ((rc = diIAGRead(imap, iagno, &mp))) { IREAD_UNLOCK(ipimap); AG_UNLOCK(imap, agno); return (rc); } iagp = (struct iag *) mp->data; /* get the inode number and extent number of the inode within * the iag and the inode number within the extent. */ ino = inum & (INOSPERIAG - 1); extno = ino >> L2INOSPEREXT; bitno = ino & (INOSPEREXT - 1); mask = HIGHORDER >> bitno; if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { jfs_error(ip->i_sb, "wmap shows inode already free\n"); } if (!addressPXD(&iagp->inoext[extno])) { release_metapage(mp); IREAD_UNLOCK(ipimap); AG_UNLOCK(imap, agno); jfs_error(ip->i_sb, "invalid inoext\n"); return -EIO; } /* compute the bitmap for the extent reflecting the freed inode. */ bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask; if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) { release_metapage(mp); IREAD_UNLOCK(ipimap); AG_UNLOCK(imap, agno); jfs_error(ip->i_sb, "numfree > numinos\n"); return -EIO; } /* * inode extent still has some inodes or below low water mark: * keep the inode extent; */ if (bitmap || imap->im_agctl[agno].numfree < 96 || (imap->im_agctl[agno].numfree < 288 && (((imap->im_agctl[agno].numfree * 100) / imap->im_agctl[agno].numinos) <= 25))) { /* if the iag currently has no free inodes (i.e., * the inode being freed is the first free inode of iag), * insert the iag at head of the inode free list for the ag. */ if (iagp->nfreeinos == 0) { /* check if there are any iags on the ag inode * free list. if so, read the first one so that * we can link the current iag onto the list at * the head. */ if ((fwd = imap->im_agctl[agno].inofree) >= 0) { /* read the iag that currently is the head * of the list. */ if ((rc = diIAGRead(imap, fwd, &amp))) { IREAD_UNLOCK(ipimap); AG_UNLOCK(imap, agno); release_metapage(mp); return (rc); } aiagp = (struct iag *) amp->data; /* make current head point back to the iag. */ aiagp->inofreeback = cpu_to_le32(iagno); write_metapage(amp); } /* iag points forward to current head and iag * becomes the new head of the list. */ iagp->inofreefwd = cpu_to_le32(imap->im_agctl[agno].inofree); iagp->inofreeback = cpu_to_le32(-1); imap->im_agctl[agno].inofree = iagno; } IREAD_UNLOCK(ipimap); /* update the free inode summary map for the extent if * freeing the inode means the extent will now have free * inodes (i.e., the inode being freed is the first free * inode of extent), */ if (iagp->wmap[extno] == cpu_to_le32(ONES)) { sword = extno >> L2EXTSPERSUM; bitno = extno & (EXTSPERSUM - 1); iagp->inosmap[sword] &= cpu_to_le32(~(HIGHORDER >> bitno)); } /* update the bitmap. */ iagp->wmap[extno] = cpu_to_le32(bitmap); /* update the free inode counts at the iag, ag and * map level. */ le32_add_cpu(&iagp->nfreeinos, 1); imap->im_agctl[agno].numfree += 1; atomic_inc(&imap->im_numfree); /* release the AG inode map lock */ AG_UNLOCK(imap, agno); /* write the iag */ write_metapage(mp); return (0); } /* * inode extent has become free and above low water mark: * free the inode extent; */ /* * prepare to update iag list(s) (careful update step 1) */ amp = bmp = cmp = dmp = NULL; fwd = back = -1; /* check if the iag currently has no free extents. if so, * it will be placed on the head of the ag extent free list. */ if (iagp->nfreeexts == 0) { /* check if the ag extent free list has any iags. * if so, read the iag at the head of the list now. * this (head) iag will be updated later to reflect * the addition of the current iag at the head of * the list. */ if ((fwd = imap->im_agctl[agno].extfree) >= 0) { if ((rc = diIAGRead(imap, fwd, &amp))) goto error_out; aiagp = (struct iag *) amp->data; } } else { /* iag has free extents. check if the addition of a free * extent will cause all extents to be free within this * iag. if so, the iag will be removed from the ag extent * free list and placed on the inode map's free iag list. */ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { /* in preparation for removing the iag from the * ag extent free list, read the iags preceding * and following the iag on the ag extent free * list. */ if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { if ((rc = diIAGRead(imap, fwd, &amp))) goto error_out; aiagp = (struct iag *) amp->data; } if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { if ((rc = diIAGRead(imap, back, &bmp))) goto error_out; biagp = (struct iag *) bmp->data; } } } /* remove the iag from the ag inode free list if freeing * this extent cause the iag to have no free inodes. */ if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { int inofreeback = le32_to_cpu(iagp->inofreeback); int inofreefwd = le32_to_cpu(iagp->inofreefwd); /* in preparation for removing the iag from the * ag inode free list, read the iags preceding * and following the iag on the ag inode free * list. before reading these iags, we must make * sure that we already don't have them in hand * from up above, since re-reading an iag (buffer) * we are currently holding would cause a deadlock. */ if (inofreefwd >= 0) { if (inofreefwd == fwd) ciagp = (struct iag *) amp->data; else if (inofreefwd == back) ciagp = (struct iag *) bmp->data; else { if ((rc = diIAGRead(imap, inofreefwd, &cmp))) goto error_out; ciagp = (struct iag *) cmp->data; } assert(ciagp != NULL); } if (inofreeback >= 0) { if (inofreeback == fwd) diagp = (struct iag *) amp->data; else if (inofreeback == back) diagp = (struct iag *) bmp->data; else { if ((rc = diIAGRead(imap, inofreeback, &dmp))) goto error_out; diagp = (struct iag *) dmp->data; } assert(diagp != NULL); } } IREAD_UNLOCK(ipimap); /* * invalidate any page of the inode extent freed from buffer cache; */ freepxd = iagp->inoext[extno]; invalidate_pxd_metapages(ip, freepxd); /* * update iag list(s) (careful update step 2) */ /* add the iag to the ag extent free list if this is the * first free extent for the iag. */ if (iagp->nfreeexts == 0) { if (fwd >= 0) aiagp->extfreeback = cpu_to_le32(iagno); iagp->extfreefwd = cpu_to_le32(imap->im_agctl[agno].extfree); iagp->extfreeback = cpu_to_le32(-1); imap->im_agctl[agno].extfree = iagno; } else { /* remove the iag from the ag extent list if all extents * are now free and place it on the inode map iag free list. */ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { if (fwd >= 0) aiagp->extfreeback = iagp->extfreeback; if (back >= 0) biagp->extfreefwd = iagp->extfreefwd; else imap->im_agctl[agno].extfree = le32_to_cpu(iagp->extfreefwd); iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); IAGFREE_LOCK(imap); iagp->iagfree = cpu_to_le32(imap->im_freeiag); imap->im_freeiag = iagno; IAGFREE_UNLOCK(imap); } } /* remove the iag from the ag inode free list if freeing * this extent causes the iag to have no free inodes. */ if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) ciagp->inofreeback = iagp->inofreeback; if ((int) le32_to_cpu(iagp->inofreeback) >= 0) diagp->inofreefwd = iagp->inofreefwd; else imap->im_agctl[agno].inofree = le32_to_cpu(iagp->inofreefwd); iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); } /* update the inode extent address and working map * to reflect the free extent. * the permanent map should have been updated already * for the inode being freed. */ if (iagp->pmap[extno] != 0) { jfs_error(ip->i_sb, "the pmap does not show inode free\n"); } iagp->wmap[extno] = 0; PXDlength(&iagp->inoext[extno], 0); PXDaddress(&iagp->inoext[extno], 0); /* update the free extent and free inode summary maps * to reflect the freed extent. * the inode summary map is marked to indicate no inodes * available for the freed extent. */ sword = extno >> L2EXTSPERSUM; bitno = extno & (EXTSPERSUM - 1); mask = HIGHORDER >> bitno; iagp->inosmap[sword] |= cpu_to_le32(mask); iagp->extsmap[sword] &= cpu_to_le32(~mask); /* update the number of free inodes and number of free extents * for the iag. */ le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1)); le32_add_cpu(&iagp->nfreeexts, 1); /* update the number of free inodes and backed inodes * at the ag and inode map level. */ imap->im_agctl[agno].numfree -= (INOSPEREXT - 1); imap->im_agctl[agno].numinos -= INOSPEREXT; atomic_sub(INOSPEREXT - 1, &imap->im_numfree); atomic_sub(INOSPEREXT, &imap->im_numinos); if (amp) write_metapage(amp); if (bmp) write_metapage(bmp); if (cmp) write_metapage(cmp); if (dmp) write_metapage(dmp); /* * start transaction to update block allocation map * for the inode extent freed; * * N.B. AG_LOCK is released and iag will be released below, and * other thread may allocate inode from/reusing the ixad freed * BUT with new/different backing inode extent from the extent * to be freed by the transaction; */ tid = txBegin(ipimap->i_sb, COMMIT_FORCE); mutex_lock(&JFS_IP(ipimap)->commit_mutex); /* acquire tlock of the iag page of the freed ixad * to force the page NOHOMEOK (even though no data is * logged from the iag page) until NOREDOPAGE|FREEXTENT log * for the free of the extent is committed; * write FREEXTENT|NOREDOPAGE log record * N.B. linelock is overlaid as freed extent descriptor; */ tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE); pxdlock = (struct pxd_lock *) & tlck->lock; pxdlock->flag = mlckFREEPXD; pxdlock->pxd = freepxd; pxdlock->index = 1; write_metapage(mp); iplist[0] = ipimap; /* * logredo needs the IAG number and IAG extent index in order * to ensure that the IMap is consistent. The least disruptive * way to pass these values through to the transaction manager * is in the iplist array. * * It's not pretty, but it works. */ iplist[1] = (struct inode *) (size_t)iagno; iplist[2] = (struct inode *) (size_t)extno; rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); txEnd(tid); mutex_unlock(&JFS_IP(ipimap)->commit_mutex); /* unlock the AG inode map information */ AG_UNLOCK(imap, agno); return (0); error_out: IREAD_UNLOCK(ipimap); if (amp) release_metapage(amp); if (bmp) release_metapage(bmp); if (cmp) release_metapage(cmp); if (dmp) release_metapage(dmp); AG_UNLOCK(imap, agno); release_metapage(mp); return (rc); } /* * There are several places in the diAlloc* routines where we initialize * the inode. */ static inline void diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); ip->i_ino = (iagno << L2INOSPERIAG) + ino; jfs_ip->ixpxd = iagp->inoext[extno]; jfs_ip->agstart = le64_to_cpu(iagp->agstart); jfs_ip->active_ag = -1; } /* * NAME: diAlloc(pip,dir,ip) * * FUNCTION: allocate a disk inode from the inode working map * for a fileset or aggregate. * * PARAMETERS: * pip - pointer to incore inode for the parent inode. * dir - 'true' if the new disk inode is for a directory. * ip - pointer to a new inode * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. * -EIO - i/o error. */ int diAlloc(struct inode *pip, bool dir, struct inode *ip) { int rc, ino, iagno, addext, extno, bitno, sword; int nwords, rem, i, agno; u32 mask, inosmap, extsmap; struct inode *ipimap; struct metapage *mp; ino_t inum; struct iag *iagp; struct inomap *imap; /* get the pointers to the inode map inode and the * corresponding imap control structure. */ ipimap = JFS_SBI(pip->i_sb)->ipimap; imap = JFS_IP(ipimap)->i_imap; JFS_IP(ip)->ipimap = ipimap; JFS_IP(ip)->fileset = FILESYSTEM_I; /* for a directory, the allocation policy is to start * at the ag level using the preferred ag. */ if (dir) { agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); AG_LOCK(imap, agno); goto tryag; } /* for files, the policy starts off by trying to allocate from * the same iag containing the parent disk inode: * try to allocate the new disk inode close to the parent disk * inode, using parent disk inode number + 1 as the allocation * hint. (we use a left-to-right policy to attempt to avoid * moving backward on the disk.) compute the hint within the * file system and the iag. */ /* get the ag number of this iag */ agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb)); if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) { /* * There is an open file actively growing. We want to * allocate new inodes from a different ag to avoid * fragmentation problems. */ agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); AG_LOCK(imap, agno); goto tryag; } inum = pip->i_ino + 1; ino = inum & (INOSPERIAG - 1); /* back off the hint if it is outside of the iag */ if (ino == 0) inum = pip->i_ino; /* lock the AG inode map information */ AG_LOCK(imap, agno); /* Get read lock on imap inode */ IREAD_LOCK(ipimap, RDWRLOCK_IMAP); /* get the iag number and read the iag */ iagno = INOTOIAG(inum); if ((rc = diIAGRead(imap, iagno, &mp))) { IREAD_UNLOCK(ipimap); AG_UNLOCK(imap, agno); return (rc); } iagp = (struct iag *) mp->data; /* determine if new inode extent is allowed to be added to the iag. * new inode extent can be added to the iag if the ag * has less than 32 free disk inodes and the iag has free extents. */ addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); /* * try to allocate from the IAG */ /* check if the inode may be allocated from the iag * (i.e. the inode has free inodes or new extent can be added). */ if (iagp->nfreeinos || addext) { /* determine the extent number of the hint. */ extno = ino >> L2INOSPEREXT; /* check if the extent containing the hint has backed * inodes. if so, try to allocate within this extent. */ if (addressPXD(&iagp->inoext[extno])) { bitno = ino & (INOSPEREXT - 1); if ((bitno = diFindFree(le32_to_cpu(iagp->wmap[extno]), bitno)) < INOSPEREXT) { ino = (extno << L2INOSPEREXT) + bitno; /* a free inode (bit) was found within this * extent, so allocate it. */ rc = diAllocBit(imap, iagp, ino); IREAD_UNLOCK(ipimap); if (rc) { assert(rc == -EIO); } else { /* set the results of the allocation * and write the iag. */ diInitInode(ip, iagno, ino, extno, iagp); mark_metapage_dirty(mp); } release_metapage(mp); /* free the AG lock and return. */ AG_UNLOCK(imap, agno); return (rc); } if (!addext) extno = (extno == EXTSPERIAG - 1) ? 0 : extno + 1; } /* * no free inodes within the extent containing the hint. * * try to allocate from the backed extents following * hint or, if appropriate (i.e. addext is true), allocate * an extent of free inodes at or following the extent * containing the hint. * * the free inode and free extent summary maps are used * here, so determine the starting summary map position * and the number of words we'll have to examine. again, * the approach is to allocate following the hint, so we * might have to initially ignore prior bits of the summary * map that represent extents prior to the extent containing * the hint and later revisit these bits. */ bitno = extno & (EXTSPERSUM - 1); nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1; sword = extno >> L2EXTSPERSUM; /* mask any prior bits for the starting words of the * summary map. */ mask = (bitno == 0) ? 0 : (ONES << (EXTSPERSUM - bitno)); inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; /* scan the free inode and free extent summary maps for * free resources. */ for (i = 0; i < nwords; i++) { /* check if this word of the free inode summary * map describes an extent with free inodes. */ if (~inosmap) { /* an extent with free inodes has been * found. determine the extent number * and the inode number within the extent. */ rem = diFindFree(inosmap, 0); extno = (sword << L2EXTSPERSUM) + rem; rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); if (rem >= INOSPEREXT) { IREAD_UNLOCK(ipimap); release_metapage(mp); AG_UNLOCK(imap, agno); jfs_error(ip->i_sb, "can't find free bit in wmap\n"); return -EIO; } /* determine the inode number within the * iag and allocate the inode from the * map. */ ino = (extno << L2INOSPEREXT) + rem; rc = diAllocBit(imap, iagp, ino); IREAD_UNLOCK(ipimap); if (rc) assert(rc == -EIO); else { /* set the results of the allocation * and write the iag. */ diInitInode(ip, iagno, ino, extno, iagp); mark_metapage_dirty(mp); } release_metapage(mp); /* free the AG lock and return. */ AG_UNLOCK(imap, agno); return (rc); } /* check if we may allocate an extent of free * inodes and whether this word of the free * extents summary map describes a free extent. */ if (addext && ~extsmap) { /* a free extent has been found. determine * the extent number. */ rem = diFindFree(extsmap, 0); extno = (sword << L2EXTSPERSUM) + rem; /* allocate an extent of free inodes. */ if ((rc = diNewExt(imap, iagp, extno))) { /* if there is no disk space for a * new extent, try to allocate the * disk inode from somewhere else. */ if (rc == -ENOSPC) break; assert(rc == -EIO); } else { /* set the results of the allocation * and write the iag. */ diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp); mark_metapage_dirty(mp); } release_metapage(mp); /* free the imap inode & the AG lock & return. */ IREAD_UNLOCK(ipimap); AG_UNLOCK(imap, agno); return (rc); } /* move on to the next set of summary map words. */ sword = (sword == SMAPSZ - 1) ? 0 : sword + 1; inosmap = le32_to_cpu(iagp->inosmap[sword]); extsmap = le32_to_cpu(iagp->extsmap[sword]); } } /* unlock imap inode */ IREAD_UNLOCK(ipimap); /* nothing doing in this iag, so release it. */ release_metapage(mp); tryag: /* * try to allocate anywhere within the same AG as the parent inode. */ rc = diAllocAG(imap, agno, dir, ip); AG_UNLOCK(imap, agno); if (rc != -ENOSPC) return (rc); /* * try to allocate in any AG. */ return (diAllocAny(imap, agno, dir, ip)); } /* * NAME: diAllocAG(imap,agno,dir,ip) * * FUNCTION: allocate a disk inode from the allocation group. * * this routine first determines if a new extent of free * inodes should be added for the allocation group, with * the current request satisfied from this extent. if this * is the case, an attempt will be made to do just that. if * this attempt fails or it has been determined that a new * extent should not be added, an attempt is made to satisfy * the request by allocating an existing (backed) free inode * from the allocation group. * * PRE CONDITION: Already have the AG lock for this AG. * * PARAMETERS: * imap - pointer to inode map control structure. * agno - allocation group to allocate from. * dir - 'true' if the new disk inode is for a directory. * ip - pointer to the new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. * -EIO - i/o error. */ static int diAllocAG(struct inomap * imap, int agno, bool dir, struct inode *ip) { int rc, addext, numfree, numinos; /* get the number of free and the number of backed disk * inodes currently within the ag. */ numfree = imap->im_agctl[agno].numfree; numinos = imap->im_agctl[agno].numinos; if (numfree > numinos) { jfs_error(ip->i_sb, "numfree > numinos\n"); return -EIO; } /* determine if we should allocate a new extent of free inodes * within the ag: for directory inodes, add a new extent * if there are a small number of free inodes or number of free * inodes is a small percentage of the number of backed inodes. */ if (dir) addext = (numfree < 64 || (numfree < 256 && ((numfree * 100) / numinos) <= 20)); else addext = (numfree == 0); /* * try to allocate a new extent of free inodes. */ if (addext) { /* if free space is not available for this new extent, try * below to allocate a free and existing (already backed) * inode from the ag. */ if ((rc = diAllocExt(imap, agno, ip)) != -ENOSPC) return (rc); } /* * try to allocate an existing free inode from the ag. */ return (diAllocIno(imap, agno, ip)); } /* * NAME: diAllocAny(imap,agno,dir,iap) * * FUNCTION: allocate a disk inode from any other allocation group. * * this routine is called when an allocation attempt within * the primary allocation group has failed. if attempts to * allocate an inode from any allocation group other than the * specified primary group. * * PARAMETERS: * imap - pointer to inode map control structure. * agno - primary allocation group (to avoid). * dir - 'true' if the new disk inode is for a directory. * ip - pointer to a new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. * -EIO - i/o error. */ static int diAllocAny(struct inomap * imap, int agno, bool dir, struct inode *ip) { int ag, rc; int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag; /* try to allocate from the ags following agno up to * the maximum ag number. */ for (ag = agno + 1; ag <= maxag; ag++) { AG_LOCK(imap, ag); rc = diAllocAG(imap, ag, dir, ip); AG_UNLOCK(imap, ag); if (rc != -ENOSPC) return (rc); } /* try to allocate from the ags in front of agno. */ for (ag = 0; ag < agno; ag++) { AG_LOCK(imap, ag); rc = diAllocAG(imap, ag, dir, ip); AG_UNLOCK(imap, ag); if (rc != -ENOSPC) return (rc); } /* no free disk inodes. */ return -ENOSPC; } /* * NAME: diAllocIno(imap,agno,ip) * * FUNCTION: allocate a disk inode from the allocation group's free * inode list, returning an error if this free list is * empty (i.e. no iags on the list). * * allocation occurs from the first iag on the list using * the iag's free inode summary map to find the leftmost * free inode in the iag. * * PRE CONDITION: Already have AG lock for this AG. * * PARAMETERS: * imap - pointer to inode map control structure. * agno - allocation group. * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. * -EIO - i/o error. */ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip) { int iagno, ino, rc, rem, extno, sword; struct metapage *mp; struct iag *iagp; /* check if there are iags on the ag's free inode list. */ if ((iagno = imap->im_agctl[agno].inofree) < 0) return -ENOSPC; /* obtain read lock on imap inode */ IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); /* read the iag at the head of the list. */ if ((rc = diIAGRead(imap, iagno, &mp))) { IREAD_UNLOCK(imap->im_ipimap); return (rc); } iagp = (struct iag *) mp->data; /* better be free inodes in this iag if it is on the * list. */ if (!iagp->nfreeinos) { IREAD_UNLOCK(imap->im_ipimap); release_metapage(mp); jfs_error(ip->i_sb, "nfreeinos = 0, but iag on freelist\n"); return -EIO; } /* scan the free inode summary map to find an extent * with free inodes. */ for (sword = 0;; sword++) { if (sword >= SMAPSZ) { IREAD_UNLOCK(imap->im_ipimap); release_metapage(mp); jfs_error(ip->i_sb, "free inode not found in summary map\n"); return -EIO; } if (~iagp->inosmap[sword]) break; } /* found a extent with free inodes. determine * the extent number. */ rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0); if (rem >= EXTSPERSUM) { IREAD_UNLOCK(imap->im_ipimap); release_metapage(mp); jfs_error(ip->i_sb, "no free extent found\n"); return -EIO; } extno = (sword << L2EXTSPERSUM) + rem; /* find the first free inode in the extent. */ rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); if (rem >= INOSPEREXT) { IREAD_UNLOCK(imap->im_ipimap); release_metapage(mp); jfs_error(ip->i_sb, "free inode not found\n"); return -EIO; } /* compute the inode number within the iag. */ ino = (extno << L2INOSPEREXT) + rem; /* allocate the inode. */ rc = diAllocBit(imap, iagp, ino); IREAD_UNLOCK(imap->im_ipimap); if (rc) { release_metapage(mp); return (rc); } /* set the results of the allocation and write the iag. */ diInitInode(ip, iagno, ino, extno, iagp); write_metapage(mp); return (0); } /* * NAME: diAllocExt(imap,agno,ip) * * FUNCTION: add a new extent of free inodes to an iag, allocating * an inode from this extent to satisfy the current allocation * request. * * this routine first tries to find an existing iag with free * extents through the ag free extent list. if list is not * empty, the head of the list will be selected as the home * of the new extent of free inodes. otherwise (the list is * empty), a new iag will be allocated for the ag to contain * the extent. * * once an iag has been selected, the free extent summary map * is used to locate a free extent within the iag and diNewExt() * is called to initialize the extent, with initialization * including the allocation of the first inode of the extent * for the purpose of satisfying this request. * * PARAMETERS: * imap - pointer to inode map control structure. * agno - allocation group number. * ip - pointer to new inode to be filled in on successful return * with the disk inode number allocated, its extent address * and the start of the ag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. * -EIO - i/o error. */ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip) { int rem, iagno, sword, extno, rc; struct metapage *mp; struct iag *iagp; /* check if the ag has any iags with free extents. if not, * allocate a new iag for the ag. */ if ((iagno = imap->im_agctl[agno].extfree) < 0) { /* If successful, diNewIAG will obtain the read lock on the * imap inode. */ if ((rc = diNewIAG(imap, &iagno, agno, &mp))) { return (rc); } iagp = (struct iag *) mp->data; /* set the ag number if this a brand new iag */ iagp->agstart = cpu_to_le64(AGTOBLK(agno, imap->im_ipimap)); } else { /* read the iag. */ IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP); if ((rc = diIAGRead(imap, iagno, &mp))) { IREAD_UNLOCK(imap->im_ipimap); jfs_error(ip->i_sb, "error reading iag\n"); return rc; } iagp = (struct iag *) mp->data; } /* using the free extent summary map, find a free extent. */ for (sword = 0;; sword++) { if (sword >= SMAPSZ) { release_metapage(mp); IREAD_UNLOCK(imap->im_ipimap); jfs_error(ip->i_sb, "free ext summary map not found\n"); return -EIO; } if (~iagp->extsmap[sword]) break; } /* determine the extent number of the free extent. */ rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0); if (rem >= EXTSPERSUM) { release_metapage(mp); IREAD_UNLOCK(imap->im_ipimap); jfs_error(ip->i_sb, "free extent not found\n"); return -EIO; } extno = (sword << L2EXTSPERSUM) + rem; /* initialize the new extent. */ rc = diNewExt(imap, iagp, extno); IREAD_UNLOCK(imap->im_ipimap); if (rc) { /* something bad happened. if a new iag was allocated, * place it back on the inode map's iag free list, and * clear the ag number information. */ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { IAGFREE_LOCK(imap); iagp->iagfree = cpu_to_le32(imap->im_freeiag); imap->im_freeiag = iagno; IAGFREE_UNLOCK(imap); } write_metapage(mp); return (rc); } /* set the results of the allocation and write the iag. */ diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp); write_metapage(mp); return (0); } /* * NAME: diAllocBit(imap,iagp,ino) * * FUNCTION: allocate a backed inode from an iag. * * this routine performs the mechanics of allocating a * specified inode from a backed extent. * * if the inode to be allocated represents the last free * inode within the iag, the iag will be removed from the * ag free inode list. * * a careful update approach is used to provide consistency * in the face of updates to multiple buffers. under this * approach, all required buffers are obtained before making * any updates and are held all are updates are complete. * * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on * this AG. Must have read lock on imap inode. * * PARAMETERS: * imap - pointer to inode map control structure. * iagp - pointer to iag. * ino - inode number to be allocated within the iag. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. * -EIO - i/o error. */ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino) { int extno, bitno, agno, sword, rc; struct metapage *amp = NULL, *bmp = NULL; struct iag *aiagp = NULL, *biagp = NULL; u32 mask; /* check if this is the last free inode within the iag. * if so, it will have to be removed from the ag free * inode list, so get the iags preceding and following * it on the list. */ if (iagp->nfreeinos == cpu_to_le32(1)) { if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) { if ((rc = diIAGRead(imap, le32_to_cpu(iagp->inofreefwd), &amp))) return (rc); aiagp = (struct iag *) amp->data; } if ((int) le32_to_cpu(iagp->inofreeback) >= 0) { if ((rc = diIAGRead(imap, le32_to_cpu(iagp->inofreeback), &bmp))) { if (amp) release_metapage(amp); return (rc); } biagp = (struct iag *) bmp->data; } } /* get the ag number, extent number, inode number within * the extent. */ agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb)); extno = ino >> L2INOSPEREXT; bitno = ino & (INOSPEREXT - 1); /* compute the mask for setting the map. */ mask = HIGHORDER >> bitno; /* the inode should be free and backed. */ if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) || ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) || (addressPXD(&iagp->inoext[extno]) == 0)) { if (amp) release_metapage(amp); if (bmp) release_metapage(bmp); jfs_error(imap->im_ipimap->i_sb, "iag inconsistent\n"); return -EIO; } /* mark the inode as allocated in the working map. */ iagp->wmap[extno] |= cpu_to_le32(mask); /* check if all inodes within the extent are now * allocated. if so, update the free inode summary * map to reflect this. */ if (iagp->wmap[extno] == cpu_to_le32(ONES)) { sword = extno >> L2EXTSPERSUM; bitno = extno & (EXTSPERSUM - 1); iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno); } /* if this was the last free inode in the iag, remove the * iag from the ag free inode list. */ if (iagp->nfreeinos == cpu_to_le32(1)) { if (amp) { aiagp->inofreeback = iagp->inofreeback; write_metapage(amp); } if (bmp) { biagp->inofreefwd = iagp->inofreefwd; write_metapage(bmp); } else { imap->im_agctl[agno].inofree = le32_to_cpu(iagp->inofreefwd); } iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); } /* update the free inode count at the iag, ag, inode * map levels. */ le32_add_cpu(&iagp->nfreeinos, -1); imap->im_agctl[agno].numfree -= 1; atomic_dec(&imap->im_numfree); return (0); } /* * NAME: diNewExt(imap,iagp,extno) * * FUNCTION: initialize a new extent of inodes for an iag, allocating * the first inode of the extent for use for the current * allocation request. * * disk resources are allocated for the new extent of inodes * and the inodes themselves are initialized to reflect their * existence within the extent (i.e. their inode numbers and * inode extent addresses are set) and their initial state * (mode and link count are set to zero). * * if the iag is new, it is not yet on an ag extent free list * but will now be placed on this list. * * if the allocation of the new extent causes the iag to * have no free extent, the iag will be removed from the * ag extent free list. * * if the iag has no free backed inodes, it will be placed * on the ag free inode list, since the addition of the new * extent will now cause it to have free inodes. * * a careful update approach is used to provide consistency * (i.e. list consistency) in the face of updates to multiple * buffers. under this approach, all required buffers are * obtained before making any updates and are held until all * updates are complete. * * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on * this AG. Must have read lock on imap inode. * * PARAMETERS: * imap - pointer to inode map control structure. * iagp - pointer to iag. * extno - extent number. * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. * -EIO - i/o error. */ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno) { int agno, iagno, fwd, back, freei = 0, sword, rc; struct iag *aiagp = NULL, *biagp = NULL, *ciagp = NULL; struct metapage *amp, *bmp, *cmp, *dmp; struct inode *ipimap; s64 blkno, hint; int i, j; u32 mask; ino_t ino; struct dinode *dp; struct jfs_sb_info *sbi; /* better have free extents. */ if (!iagp->nfreeexts) { jfs_error(imap->im_ipimap->i_sb, "no free extents\n"); return -EIO; } /* get the inode map inode. */ ipimap = imap->im_ipimap; sbi = JFS_SBI(ipimap->i_sb); amp = bmp = cmp = NULL; /* get the ag and iag numbers for this iag. */ agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); iagno = le32_to_cpu(iagp->iagnum); /* check if this is the last free extent within the * iag. if so, the iag must be removed from the ag * free extent list, so get the iags preceding and * following the iag on this list. */ if (iagp->nfreeexts == cpu_to_le32(1)) { if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { if ((rc = diIAGRead(imap, fwd, &amp))) return (rc); aiagp = (struct iag *) amp->data; } if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { if ((rc = diIAGRead(imap, back, &bmp))) goto error_out; biagp = (struct iag *) bmp->data; } } else { /* the iag has free extents. if all extents are free * (as is the case for a newly allocated iag), the iag * must be added to the ag free extent list, so get * the iag at the head of the list in preparation for * adding this iag to this list. */ fwd = back = -1; if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { if ((fwd = imap->im_agctl[agno].extfree) >= 0) { if ((rc = diIAGRead(imap, fwd, &amp))) goto error_out; aiagp = (struct iag *) amp->data; } } } /* check if the iag has no free inodes. if so, the iag * will have to be added to the ag free inode list, so get * the iag at the head of the list in preparation for * adding this iag to this list. in doing this, we must * check if we already have the iag at the head of * the list in hand. */ if (iagp->nfreeinos == 0) { freei = imap->im_agctl[agno].inofree; if (freei >= 0) { if (freei == fwd) { ciagp = aiagp; } else if (freei == back) { ciagp = biagp; } else { if ((rc = diIAGRead(imap, freei, &cmp))) goto error_out; ciagp = (struct iag *) cmp->data; } if (ciagp == NULL) { jfs_error(imap->im_ipimap->i_sb, "ciagp == NULL\n"); rc = -EIO; goto error_out; } } } /* allocate disk space for the inode extent. */ if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0)) hint = ((s64) agno << sbi->bmap->db_agl2size) - 1; else hint = addressPXD(&iagp->inoext[extno - 1]) + lengthPXD(&iagp->inoext[extno - 1]) - 1; if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno))) goto error_out; /* compute the inode number of the first inode within the * extent. */ ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT); /* initialize the inodes within the newly allocated extent a * page at a time. */ for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) { /* get a buffer for this page of disk inodes. */ dmp = get_metapage(ipimap, blkno + i, PSIZE, 1); if (dmp == NULL) { rc = -EIO; goto error_out; } dp = (struct dinode *) dmp->data; /* initialize the inode number, mode, link count and * inode extent address. */ for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) { dp->di_inostamp = cpu_to_le32(sbi->inostamp); dp->di_number = cpu_to_le32(ino); dp->di_fileset = cpu_to_le32(FILESYSTEM_I); dp->di_mode = 0; dp->di_nlink = 0; PXDaddress(&(dp->di_ixpxd), blkno); PXDlength(&(dp->di_ixpxd), imap->im_nbperiext); } write_metapage(dmp); } /* if this is the last free extent within the iag, remove the * iag from the ag free extent list. */ if (iagp->nfreeexts == cpu_to_le32(1)) { if (fwd >= 0) aiagp->extfreeback = iagp->extfreeback; if (back >= 0) biagp->extfreefwd = iagp->extfreefwd; else imap->im_agctl[agno].extfree = le32_to_cpu(iagp->extfreefwd); iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); } else { /* if the iag has all free extents (newly allocated iag), * add the iag to the ag free extent list. */ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { if (fwd >= 0) aiagp->extfreeback = cpu_to_le32(iagno); iagp->extfreefwd = cpu_to_le32(fwd); iagp->extfreeback = cpu_to_le32(-1); imap->im_agctl[agno].extfree = iagno; } } /* if the iag has no free inodes, add the iag to the * ag free inode list. */ if (iagp->nfreeinos == 0) { if (freei >= 0) ciagp->inofreeback = cpu_to_le32(iagno); iagp->inofreefwd = cpu_to_le32(imap->im_agctl[agno].inofree); iagp->inofreeback = cpu_to_le32(-1); imap->im_agctl[agno].inofree = iagno; } /* initialize the extent descriptor of the extent. */ PXDlength(&iagp->inoext[extno], imap->im_nbperiext); PXDaddress(&iagp->inoext[extno], blkno); /* initialize the working and persistent map of the extent. * the working map will be initialized such that * it indicates the first inode of the extent is allocated. */ iagp->wmap[extno] = cpu_to_le32(HIGHORDER); iagp->pmap[extno] = 0; /* update the free inode and free extent summary maps * for the extent to indicate the extent has free inodes * and no longer represents a free extent. */ sword = extno >> L2EXTSPERSUM; mask = HIGHORDER >> (extno & (EXTSPERSUM - 1)); iagp->extsmap[sword] |= cpu_to_le32(mask); iagp->inosmap[sword] &= cpu_to_le32(~mask); /* update the free inode and free extent counts for the * iag. */ le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1)); le32_add_cpu(&iagp->nfreeexts, -1); /* update the free and backed inode counts for the ag. */ imap->im_agctl[agno].numfree += (INOSPEREXT - 1); imap->im_agctl[agno].numinos += INOSPEREXT; /* update the free and backed inode counts for the inode map. */ atomic_add(INOSPEREXT - 1, &imap->im_numfree); atomic_add(INOSPEREXT, &imap->im_numinos); /* write the iags. */ if (amp) write_metapage(amp); if (bmp) write_metapage(bmp); if (cmp) write_metapage(cmp); return (0); error_out: /* release the iags. */ if (amp) release_metapage(amp); if (bmp) release_metapage(bmp); if (cmp) release_metapage(cmp); return (rc); } /* * NAME: diNewIAG(imap,iagnop,agno) * * FUNCTION: allocate a new iag for an allocation group. * * first tries to allocate the iag from the inode map * iagfree list: * if the list has free iags, the head of the list is removed * and returned to satisfy the request. * if the inode map's iag free list is empty, the inode map * is extended to hold a new iag. this new iag is initialized * and returned to satisfy the request. * * PARAMETERS: * imap - pointer to inode map control structure. * iagnop - pointer to an iag number set with the number of the * newly allocated iag upon successful return. * agno - allocation group number. * bpp - Buffer pointer to be filled in with new IAG's buffer * * RETURN VALUES: * 0 - success. * -ENOSPC - insufficient disk resources. * -EIO - i/o error. * * serialization: * AG lock held on entry/exit; * write lock on the map is held inside; * read lock on the map is held on successful completion; * * note: new iag transaction: * . synchronously write iag; * . write log of xtree and inode of imap; * . commit; * . synchronous write of xtree (right to left, bottom to top); * . at start of logredo(): init in-memory imap with one additional iag page; * . at end of logredo(): re-read imap inode to determine * new imap size; */ static int diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) { int rc; int iagno, i, xlen; struct inode *ipimap; struct super_block *sb; struct jfs_sb_info *sbi; struct metapage *mp; struct iag *iagp; s64 xaddr = 0; s64 blkno; tid_t tid; struct inode *iplist[1]; /* pick up pointers to the inode map and mount inodes */ ipimap = imap->im_ipimap; sb = ipimap->i_sb; sbi = JFS_SBI(sb); /* acquire the free iag lock */ IAGFREE_LOCK(imap); /* if there are any iags on the inode map free iag list, * allocate the iag from the head of the list. */ if (imap->im_freeiag >= 0) { /* pick up the iag number at the head of the list */ iagno = imap->im_freeiag; /* determine the logical block number of the iag */ blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); } else { /* no free iags. the inode map will have to be extented * to include a new iag. */ /* acquire inode map lock */ IWRITE_LOCK(ipimap, RDWRLOCK_IMAP); if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { IWRITE_UNLOCK(ipimap); IAGFREE_UNLOCK(imap); jfs_error(imap->im_ipimap->i_sb, "ipimap->i_size is wrong\n"); return -EIO; } /* get the next available iag number */ iagno = imap->im_nextiag; /* make sure that we have not exceeded the maximum inode * number limit. */ if (iagno > (MAXIAGS - 1)) { /* release the inode map lock */ IWRITE_UNLOCK(ipimap); rc = -ENOSPC; goto out; } /* * synchronously append new iag page. */ /* determine the logical address of iag page to append */ blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); /* Allocate extent for new iag page */ xlen = sbi->nbperpage; if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) { /* release the inode map lock */ IWRITE_UNLOCK(ipimap); goto out; } /* * start transaction of update of the inode map * addressing structure pointing to the new iag page; */ tid = txBegin(sb, COMMIT_FORCE); mutex_lock(&JFS_IP(ipimap)->commit_mutex); /* update the inode map addressing structure to point to it */ if ((rc = xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) { txEnd(tid); mutex_unlock(&JFS_IP(ipimap)->commit_mutex); /* Free the blocks allocated for the iag since it was * not successfully added to the inode map */ dbFree(ipimap, xaddr, (s64) xlen); /* release the inode map lock */ IWRITE_UNLOCK(ipimap); goto out; } /* update the inode map's inode to reflect the extension */ ipimap->i_size += PSIZE; inode_add_bytes(ipimap, PSIZE); /* assign a buffer for the page */ mp = get_metapage(ipimap, blkno, PSIZE, 0); if (!mp) { /* * This is very unlikely since we just created the * extent, but let's try to handle it correctly */ xtTruncate(tid, ipimap, ipimap->i_size - PSIZE, COMMIT_PWMAP); txAbort(tid, 0); txEnd(tid); mutex_unlock(&JFS_IP(ipimap)->commit_mutex); /* release the inode map lock */ IWRITE_UNLOCK(ipimap); rc = -EIO; goto out; } iagp = (struct iag *) mp->data; /* init the iag */ memset(iagp, 0, sizeof(struct iag)); iagp->iagnum = cpu_to_le32(iagno); iagp->inofreefwd = iagp->inofreeback = cpu_to_le32(-1); iagp->extfreefwd = iagp->extfreeback = cpu_to_le32(-1); iagp->iagfree = cpu_to_le32(-1); iagp->nfreeinos = 0; iagp->nfreeexts = cpu_to_le32(EXTSPERIAG); /* initialize the free inode summary map (free extent * summary map initialization handled by bzero). */ for (i = 0; i < SMAPSZ; i++) iagp->inosmap[i] = cpu_to_le32(ONES); /* * Write and sync the metapage */ flush_metapage(mp); /* * txCommit(COMMIT_FORCE) will synchronously write address * index pages and inode after commit in careful update order * of address index pages (right to left, bottom up); */ iplist[0] = ipimap; rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); txEnd(tid); mutex_unlock(&JFS_IP(ipimap)->commit_mutex); duplicateIXtree(sb, blkno, xlen, &xaddr); /* update the next available iag number */ imap->im_nextiag += 1; /* Add the iag to the iag free list so we don't lose the iag * if a failure happens now. */ imap->im_freeiag = iagno; /* Until we have logredo working, we want the imap inode & * control page to be up to date. */ diSync(ipimap); /* release the inode map lock */ IWRITE_UNLOCK(ipimap); } /* obtain read lock on map */ IREAD_LOCK(ipimap, RDWRLOCK_IMAP); /* read the iag */ if ((rc = diIAGRead(imap, iagno, &mp))) { IREAD_UNLOCK(ipimap); rc = -EIO; goto out; } iagp = (struct iag *) mp->data; /* remove the iag from the iag free list */ imap->im_freeiag = le32_to_cpu(iagp->iagfree); iagp->iagfree = cpu_to_le32(-1); /* set the return iag number and buffer pointer */ *iagnop = iagno; *mpp = mp; out: /* release the iag free lock */ IAGFREE_UNLOCK(imap); return (rc); } /* * NAME: diIAGRead() * * FUNCTION: get the buffer for the specified iag within a fileset * or aggregate inode map. * * PARAMETERS: * imap - pointer to inode map control structure. * iagno - iag number. * bpp - point to buffer pointer to be filled in on successful * exit. * * SERIALIZATION: * must have read lock on imap inode * (When called by diExtendFS, the filesystem is quiesced, therefore * the read lock is unnecessary.) * * RETURN VALUES: * 0 - success. * -EIO - i/o error. */ static int diIAGRead(struct inomap * imap, int iagno, struct metapage ** mpp) { struct inode *ipimap = imap->im_ipimap; s64 blkno; /* compute the logical block number of the iag. */ blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage); /* read the iag. */ *mpp = read_metapage(ipimap, blkno, PSIZE, 0); if (*mpp == NULL) { return -EIO; } return (0); } /* * NAME: diFindFree() * * FUNCTION: find the first free bit in a word starting at * the specified bit position. * * PARAMETERS: * word - word to be examined. * start - starting bit position. * * RETURN VALUES: * bit position of first free bit in the word or 32 if * no free bits were found. */ static int diFindFree(u32 word, int start) { int bitno; assert(start < 32); /* scan the word for the first free bit. */ for (word <<= start, bitno = start; bitno < 32; bitno++, word <<= 1) { if ((word & HIGHORDER) == 0) break; } return (bitno); } /* * NAME: diUpdatePMap() * * FUNCTION: Update the persistent map in an IAG for the allocation or * freeing of the specified inode. * * PRE CONDITIONS: Working map has already been updated for allocate. * * PARAMETERS: * ipimap - Incore inode map inode * inum - Number of inode to mark in permanent map * is_free - If 'true' indicates inode should be marked freed, otherwise * indicates inode should be marked allocated. * * RETURN VALUES: * 0 for success */ int diUpdatePMap(struct inode *ipimap, unsigned long inum, bool is_free, struct tblock * tblk) { int rc; struct iag *iagp; struct metapage *mp; int iagno, ino, extno, bitno; struct inomap *imap; u32 mask; struct jfs_log *log; int lsn, difft, diffp; unsigned long flags; imap = JFS_IP(ipimap)->i_imap; /* get the iag number containing the inode */ iagno = INOTOIAG(inum); /* make sure that the iag is contained within the map */ if (iagno >= imap->im_nextiag) { jfs_error(ipimap->i_sb, "the iag is outside the map\n"); return -EIO; } /* read the iag */ IREAD_LOCK(ipimap, RDWRLOCK_IMAP); rc = diIAGRead(imap, iagno, &mp); IREAD_UNLOCK(ipimap); if (rc) return (rc); metapage_wait_for_io(mp); iagp = (struct iag *) mp->data; /* get the inode number and extent number of the inode within * the iag and the inode number within the extent. */ ino = inum & (INOSPERIAG - 1); extno = ino >> L2INOSPEREXT; bitno = ino & (INOSPEREXT - 1); mask = HIGHORDER >> bitno; /* * mark the inode free in persistent map: */ if (is_free) { /* The inode should have been allocated both in working * map and in persistent map; * the inode will be freed from working map at the release * of last reference release; */ if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { jfs_error(ipimap->i_sb, "inode %ld not marked as allocated in wmap!\n", inum); } if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { jfs_error(ipimap->i_sb, "inode %ld not marked as allocated in pmap!\n", inum); } /* update the bitmap for the extent of the freed inode */ iagp->pmap[extno] &= cpu_to_le32(~mask); } /* * mark the inode allocated in persistent map: */ else { /* The inode should be already allocated in the working map * and should be free in persistent map; */ if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { release_metapage(mp); jfs_error(ipimap->i_sb, "the inode is not allocated in the working map\n"); return -EIO; } if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { release_metapage(mp); jfs_error(ipimap->i_sb, "the inode is not free in the persistent map\n"); return -EIO; } /* update the bitmap for the extent of the allocated inode */ iagp->pmap[extno] |= cpu_to_le32(mask); } /* * update iag lsn */ lsn = tblk->lsn; log = JFS_SBI(tblk->sb)->log; LOGSYNC_LOCK(log, flags); if (mp->lsn != 0) { /* inherit older/smaller lsn */ logdiff(difft, lsn, log); logdiff(diffp, mp->lsn, log); if (difft < diffp) { mp->lsn = lsn; /* move mp after tblock in logsync list */ list_move(&mp->synclist, &tblk->synclist); } /* inherit younger/larger clsn */ assert(mp->clsn); logdiff(difft, tblk->clsn, log); logdiff(diffp, mp->clsn, log); if (difft > diffp) mp->clsn = tblk->clsn; } else { mp->log = log; mp->lsn = lsn; /* insert mp after tblock in logsync list */ log->count++; list_add(&mp->synclist, &tblk->synclist); mp->clsn = tblk->clsn; } LOGSYNC_UNLOCK(log, flags); write_metapage(mp); return (0); } /* * diExtendFS() * * function: update imap for extendfs(); * * note: AG size has been increased s.t. each k old contiguous AGs are * coalesced into a new AG; */ int diExtendFS(struct inode *ipimap, struct inode *ipbmap) { int rc, rcx = 0; struct inomap *imap = JFS_IP(ipimap)->i_imap; struct iag *iagp = NULL, *hiagp = NULL; struct bmap *mp = JFS_SBI(ipbmap->i_sb)->bmap; struct metapage *bp, *hbp; int i, n, head; int numinos, xnuminos = 0, xnumfree = 0; s64 agstart; jfs_info("diExtendFS: nextiag:%d numinos:%d numfree:%d", imap->im_nextiag, atomic_read(&imap->im_numinos), atomic_read(&imap->im_numfree)); /* * reconstruct imap * * coalesce contiguous k (newAGSize/oldAGSize) AGs; * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; * note: new AG size = old AG size * (2**x). */ /* init per AG control information im_agctl[] */ for (i = 0; i < MAXAG; i++) { imap->im_agctl[i].inofree = -1; imap->im_agctl[i].extfree = -1; imap->im_agctl[i].numinos = 0; /* number of backed inodes */ imap->im_agctl[i].numfree = 0; /* number of free backed inodes */ } /* * process each iag page of the map. * * rebuild AG Free Inode List, AG Free Inode Extent List; */ for (i = 0; i < imap->im_nextiag; i++) { if ((rc = diIAGRead(imap, i, &bp))) { rcx = rc; continue; } iagp = (struct iag *) bp->data; if (le32_to_cpu(iagp->iagnum) != i) { release_metapage(bp); jfs_error(ipimap->i_sb, "unexpected value of iagnum\n"); return -EIO; } /* leave free iag in the free iag list */ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { release_metapage(bp); continue; } agstart = le64_to_cpu(iagp->agstart); n = agstart >> mp->db_agl2size; iagp->agstart = cpu_to_le64((s64)n << mp->db_agl2size); /* compute backed inodes */ numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) << L2INOSPEREXT; if (numinos > 0) { /* merge AG backed inodes */ imap->im_agctl[n].numinos += numinos; xnuminos += numinos; } /* if any backed free inodes, insert at AG free inode list */ if ((int) le32_to_cpu(iagp->nfreeinos) > 0) { if ((head = imap->im_agctl[n].inofree) == -1) { iagp->inofreefwd = cpu_to_le32(-1); iagp->inofreeback = cpu_to_le32(-1); } else { if ((rc = diIAGRead(imap, head, &hbp))) { rcx = rc; goto nextiag; } hiagp = (struct iag *) hbp->data; hiagp->inofreeback = iagp->iagnum; iagp->inofreefwd = cpu_to_le32(head); iagp->inofreeback = cpu_to_le32(-1); write_metapage(hbp); } imap->im_agctl[n].inofree = le32_to_cpu(iagp->iagnum); /* merge AG backed free inodes */ imap->im_agctl[n].numfree += le32_to_cpu(iagp->nfreeinos); xnumfree += le32_to_cpu(iagp->nfreeinos); } /* if any free extents, insert at AG free extent list */ if (le32_to_cpu(iagp->nfreeexts) > 0) { if ((head = imap->im_agctl[n].extfree) == -1) { iagp->extfreefwd = cpu_to_le32(-1); iagp->extfreeback = cpu_to_le32(-1); } else { if ((rc = diIAGRead(imap, head, &hbp))) { rcx = rc; goto nextiag; } hiagp = (struct iag *) hbp->data; hiagp->extfreeback = iagp->iagnum; iagp->extfreefwd = cpu_to_le32(head); iagp->extfreeback = cpu_to_le32(-1); write_metapage(hbp); } imap->im_agctl[n].extfree = le32_to_cpu(iagp->iagnum); } nextiag: write_metapage(bp); } if (xnuminos != atomic_read(&imap->im_numinos) || xnumfree != atomic_read(&imap->im_numfree)) { jfs_error(ipimap->i_sb, "numinos or numfree incorrect\n"); return -EIO; } return rcx; } /* * duplicateIXtree() * * serialization: IWRITE_LOCK held on entry/exit * * note: shadow page with regular inode (rel.2); */ static void duplicateIXtree(struct super_block *sb, s64 blkno, int xlen, s64 *xaddr) { struct jfs_superblock *j_sb; struct buffer_head *bh; struct inode *ip; tid_t tid; /* if AIT2 ipmap2 is bad, do not try to update it */ if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT) /* s_flag */ return; ip = diReadSpecial(sb, FILESYSTEM_I, 1); if (ip == NULL) { JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; if (readSuper(sb, &bh)) return; j_sb = (struct jfs_superblock *)bh->b_data; j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); mark_buffer_dirty(bh); sync_dirty_buffer(bh); brelse(bh); return; } /* start transaction */ tid = txBegin(sb, COMMIT_FORCE); /* update the inode map addressing structure to point to it */ if (xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0)) { JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; txAbort(tid, 1); goto cleanup; } /* update the inode map's inode to reflect the extension */ ip->i_size += PSIZE; inode_add_bytes(ip, PSIZE); txCommit(tid, 1, &ip, COMMIT_FORCE); cleanup: txEnd(tid); diFreeSpecial(ip); } /* * NAME: copy_from_dinode() * * FUNCTION: Copies inode info from disk inode to in-memory inode * * RETURN VALUES: * 0 - success * -ENOMEM - insufficient memory */ static int copy_from_dinode(struct dinode * dip, struct inode *ip) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); jfs_ip->fileset = le32_to_cpu(dip->di_fileset); jfs_ip->mode2 = le32_to_cpu(dip->di_mode); jfs_set_inode_flags(ip); ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff; if (sbi->umask != -1) { ip->i_mode = (ip->i_mode & ~0777) | (0777 & ~sbi->umask); /* For directories, add x permission if r is allowed by umask */ if (S_ISDIR(ip->i_mode)) { if (ip->i_mode & 0400) ip->i_mode |= 0100; if (ip->i_mode & 0040) ip->i_mode |= 0010; if (ip->i_mode & 0004) ip->i_mode |= 0001; } } set_nlink(ip, le32_to_cpu(dip->di_nlink)); jfs_ip->saved_uid = make_kuid(&init_user_ns, le32_to_cpu(dip->di_uid)); if (!uid_valid(sbi->uid)) ip->i_uid = jfs_ip->saved_uid; else { ip->i_uid = sbi->uid; } jfs_ip->saved_gid = make_kgid(&init_user_ns, le32_to_cpu(dip->di_gid)); if (!gid_valid(sbi->gid)) ip->i_gid = jfs_ip->saved_gid; else { ip->i_gid = sbi->gid; } ip->i_size = le64_to_cpu(dip->di_size); ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec); ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); ip->i_generation = le32_to_cpu(dip->di_gen); jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */ jfs_ip->acl = dip->di_acl; /* as are dxd's */ jfs_ip->ea = dip->di_ea; jfs_ip->next_index = le32_to_cpu(dip->di_next_index); jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec); jfs_ip->acltype = le32_to_cpu(dip->di_acltype); if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) { jfs_ip->dev = le32_to_cpu(dip->di_rdev); ip->i_rdev = new_decode_dev(jfs_ip->dev); } if (S_ISDIR(ip->i_mode)) { memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384); } else if (S_ISREG(ip->i_mode) || S_ISLNK(ip->i_mode)) { memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288); } else memcpy(&jfs_ip->i_inline_ea, &dip->di_inlineea, 128); /* Zero the in-memory-only stuff */ jfs_ip->cflag = 0; jfs_ip->btindex = 0; jfs_ip->btorder = 0; jfs_ip->bxflag = 0; jfs_ip->blid = 0; jfs_ip->atlhead = 0; jfs_ip->atltail = 0; jfs_ip->xtlid = 0; return (0); } /* * NAME: copy_to_dinode() * * FUNCTION: Copies inode info from in-memory inode to disk inode */ static void copy_to_dinode(struct dinode * dip, struct inode *ip) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); dip->di_fileset = cpu_to_le32(jfs_ip->fileset); dip->di_inostamp = cpu_to_le32(sbi->inostamp); dip->di_number = cpu_to_le32(ip->i_ino); dip->di_gen = cpu_to_le32(ip->i_generation); dip->di_size = cpu_to_le64(ip->i_size); dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); dip->di_nlink = cpu_to_le32(ip->i_nlink); if (!uid_valid(sbi->uid)) dip->di_uid = cpu_to_le32(i_uid_read(ip)); else dip->di_uid =cpu_to_le32(from_kuid(&init_user_ns, jfs_ip->saved_uid)); if (!gid_valid(sbi->gid)) dip->di_gid = cpu_to_le32(i_gid_read(ip)); else dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns, jfs_ip->saved_gid)); /* * mode2 is only needed for storing the higher order bits. * Trust i_mode for the lower order ones */ if (sbi->umask == -1) dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | ip->i_mode); else /* Leave the original permissions alone */ dip->di_mode = cpu_to_le32(jfs_ip->mode2); dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ dip->di_acl = jfs_ip->acl; /* as are dxd's */ dip->di_ea = jfs_ip->ea; dip->di_next_index = cpu_to_le32(jfs_ip->next_index); dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime); dip->di_otime.tv_nsec = 0; dip->di_acltype = cpu_to_le32(jfs_ip->acltype); if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) dip->di_rdev = cpu_to_le32(jfs_ip->dev); }
72 71 70 2 2 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/sgi.c * * Code extracted from drivers/block/genhd.c */ #include "check.h" #include "sgi.h" struct sgi_disklabel { __be32 magic_mushroom; /* Big fat spliff... */ __be16 root_part_num; /* Root partition number */ __be16 swap_part_num; /* Swap partition number */ s8 boot_file[16]; /* Name of boot file for ARCS */ u8 _unused0[48]; /* Device parameter useless crapola.. */ struct sgi_volume { s8 name[8]; /* Name of volume */ __be32 block_num; /* Logical block number */ __be32 num_bytes; /* How big, in bytes */ } volume[15]; struct sgi_partition { __be32 num_blocks; /* Size in logical blocks */ __be32 first_block; /* First logical block */ __be32 type; /* Type of this partition */ } partitions[16]; __be32 csum; /* Disk label checksum */ __be32 _unused1; /* Padding */ }; int sgi_partition(struct parsed_partitions *state) { int i, csum; __be32 magic; int slot = 1; unsigned int start, blocks; __be32 *ui, cs; Sector sect; struct sgi_disklabel *label; struct sgi_partition *p; char b[BDEVNAME_SIZE]; label = read_part_sector(state, 0, &sect); if (!label) return -1; p = &label->partitions[0]; magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { /*printk("Dev %s SGI disklabel: bad magic %08x\n", bdevname(bdev, b), be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } ui = ((__be32 *) (label + 1)) - 1; for(csum = 0; ui >= ((__be32 *) label);) { cs = *ui--; csum += be32_to_cpu(cs); } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", bdevname(state->bdev, b)); put_dev_sector(sect); return 0; } /* All SGI disk labels have 16 partitions, disks under Linux only * have 15 minor's. Luckily there are always a few zero length * partitions which we don't care about so we never overflow the * current_minor. */ for(i = 0; i < 16; i++, p++) { blocks = be32_to_cpu(p->num_blocks); start = be32_to_cpu(p->first_block); if (blocks) { put_partition(state, slot, start, blocks); if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; } slot++; } strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; }
76 53 53 53 48 48 2 48 48 48 50 49 51 51 51 51 51 51 51 51 51 51 49 50 1 50 49 50 50 50 45 50 50 3 50 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 50 50 45 45 45 45 45 2 45 44 5 5 5 3 3 5 5 2 5 5 5 5 5 5 5 5 5 5 5 5 2 2 45 45 43 44 43 44 1 43 51 51 51 51 7 7 51 51 51 40 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 /* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h> #include "inftrees.h" #include "inflate.h" #include "inffast.h" #include "infutil.h" int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; state->window = &WS(strm)->working_window[0]; return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* Reverse the bytes in a 32-bit value */ #define REVERSE(q) \ ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \ (((q) & 0xff00) << 8) + (((q) & 0xff) << 24)) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. */ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; case TYPE: if (flush == Z_BLOCK) goto inf_leave; case TYPEDO: if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; case LENLENS: while (state->have < state->ncode) { NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra = (unsigned)(this.op) & 15; state->mode = DISTEXT; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (state->wsize || (state->mode < CHECK && out != strm->avail_out)) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. */ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; }
19 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_DMA_MAPPING_H #define _LINUX_DMA_MAPPING_H #include <linux/sizes.h> #include <linux/string.h> #include <linux/device.h> #include <linux/err.h> #include <linux/dma-debug.h> #include <linux/dma-direction.h> #include <linux/scatterlist.h> #include <linux/bug.h> #include <linux/mem_encrypt.h> /** * List of possible attributes associated with a DMA mapping. The semantics * of each attribute should be defined in Documentation/DMA-attributes.txt. * * DMA_ATTR_WRITE_BARRIER: DMA to a memory region with this attribute * forces all pending DMA writes to complete. */ #define DMA_ATTR_WRITE_BARRIER (1UL << 0) /* * DMA_ATTR_WEAK_ORDERING: Specifies that reads and writes to the mapping * may be weakly ordered, that is that reads and writes may pass each other. */ #define DMA_ATTR_WEAK_ORDERING (1UL << 1) /* * DMA_ATTR_WRITE_COMBINE: Specifies that writes to the mapping may be * buffered to improve performance. */ #define DMA_ATTR_WRITE_COMBINE (1UL << 2) /* * DMA_ATTR_NON_CONSISTENT: Lets the platform to choose to return either * consistent or non-consistent memory as it sees fit. */ #define DMA_ATTR_NON_CONSISTENT (1UL << 3) /* * DMA_ATTR_NO_KERNEL_MAPPING: Lets the platform to avoid creating a kernel * virtual mapping for the allocated buffer. */ #define DMA_ATTR_NO_KERNEL_MAPPING (1UL << 4) /* * DMA_ATTR_SKIP_CPU_SYNC: Allows platform code to skip synchronization of * the CPU cache for the given buffer assuming that it has been already * transferred to 'device' domain. */ #define DMA_ATTR_SKIP_CPU_SYNC (1UL << 5) /* * DMA_ATTR_FORCE_CONTIGUOUS: Forces contiguous allocation of the buffer * in physical memory. */ #define DMA_ATTR_FORCE_CONTIGUOUS (1UL << 6) /* * DMA_ATTR_ALLOC_SINGLE_PAGES: This is a hint to the DMA-mapping subsystem * that it's probably not worth the time to try to allocate memory to in a way * that gives better TLB efficiency. */ #define DMA_ATTR_ALLOC_SINGLE_PAGES (1UL << 7) /* * DMA_ATTR_NO_WARN: This tells the DMA-mapping subsystem to suppress * allocation failure reports (similarly to __GFP_NOWARN). */ #define DMA_ATTR_NO_WARN (1UL << 8) /* * DMA_ATTR_PRIVILEGED: used to indicate that the buffer is fully * accessible at an elevated privilege level (and ideally inaccessible or * at least read-only at lesser-privileged levels). */ #define DMA_ATTR_PRIVILEGED (1UL << 9) /* * A dma_addr_t can hold any valid DMA or bus address for the platform. * It can be given to a device to use as a DMA source or target. A CPU cannot * reference a dma_addr_t directly because there may be translation between * its physical address space and the bus address space. */ struct dma_map_ops { void* (*alloc)(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); void (*free)(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs); int (*mmap)(struct device *, struct vm_area_struct *, void *, dma_addr_t, size_t, unsigned long attrs); int (*get_sgtable)(struct device *dev, struct sg_table *sgt, void *, dma_addr_t, size_t, unsigned long attrs); dma_addr_t (*map_page)(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); void (*unmap_page)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); /* * map_sg returns 0 on error and a value > 0 on success. * It should never return a value < 0. */ int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); void (*unmap_sg)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs); dma_addr_t (*map_resource)(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs); void (*unmap_resource)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir, unsigned long attrs); void (*sync_single_for_cpu)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void (*sync_single_for_device)(struct device *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction dir); void (*sync_sg_for_cpu)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir); void (*sync_sg_for_device)(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir); int (*mapping_error)(struct device *dev, dma_addr_t dma_addr); int (*dma_supported)(struct device *dev, u64 mask); #ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK u64 (*get_required_mask)(struct device *dev); #endif int is_phys; }; extern const struct dma_map_ops dma_noop_ops; extern const struct dma_map_ops dma_virt_ops; #define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) #define DMA_MASK_NONE 0x0ULL static inline int valid_dma_direction(int dma_direction) { return ((dma_direction == DMA_BIDIRECTIONAL) || (dma_direction == DMA_TO_DEVICE) || (dma_direction == DMA_FROM_DEVICE)); } static inline int is_device_dma_capable(struct device *dev) { return dev->dma_mask != NULL && *dev->dma_mask != DMA_MASK_NONE; } #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT /* * These three functions are only for dma allocator. * Don't use them in device drivers. */ int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size, dma_addr_t *dma_handle, void **ret); int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr); int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle); int dma_release_from_global_coherent(int order, void *vaddr); int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret); #else #define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0) #define dma_release_from_dev_coherent(dev, order, vaddr) (0) #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0) static inline void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle) { return NULL; } static inline int dma_release_from_global_coherent(int order, void *vaddr) { return 0; } static inline int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr, size_t size, int *ret) { return 0; } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ #ifdef CONFIG_HAS_DMA #include <asm/dma-mapping.h> static inline const struct dma_map_ops *get_dma_ops(struct device *dev) { if (dev && dev->dma_ops) return dev->dma_ops; return get_arch_dma_ops(dev ? dev->bus : NULL); } static inline void set_dma_ops(struct device *dev, const struct dma_map_ops *dma_ops) { dev->dma_ops = dma_ops; } #else /* * Define the dma api to allow compilation but not linking of * dma dependent code. Code that depends on the dma-mapping * API needs to set 'depends on HAS_DMA' in its Kconfig */ extern const struct dma_map_ops bad_dma_ops; static inline const struct dma_map_ops *get_dma_ops(struct device *dev) { return &bad_dma_ops; } #endif static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, virt_to_page(ptr), offset_in_page(ptr), size, dir, attrs); debug_dma_map_page(dev, virt_to_page(ptr), offset_in_page(ptr), size, dir, addr, true); return addr; } static inline void dma_unmap_single_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); debug_dma_unmap_page(dev, addr, size, dir, true); } /* * dma_maps_sg_attrs returns 0 on error and > 0 on success. * It should never return a value < 0. */ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); int ents; BUG_ON(!valid_dma_direction(dir)); ents = ops->map_sg(dev, sg, nents, dir, attrs); BUG_ON(ents < 0); debug_dma_map_sg(dev, sg, nents, ents, dir); return ents; } static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); debug_dma_unmap_sg(dev, sg, nents, dir); if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); } static inline dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, size_t offset, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, page, offset, size, dir, attrs); debug_dma_map_page(dev, page, offset, size, dir, addr, false); return addr; } static inline void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); debug_dma_unmap_page(dev, addr, size, dir, false); } static inline dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; BUG_ON(!valid_dma_direction(dir)); /* Don't allow RAM to be mapped */ BUG_ON(pfn_valid(PHYS_PFN(phys_addr))); addr = phys_addr; if (ops->map_resource) addr = ops->map_resource(dev, phys_addr, size, dir, attrs); debug_dma_map_resource(dev, phys_addr, size, dir, addr); return addr; } static inline void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->unmap_resource) ops->unmap_resource(dev, addr, size, dir, attrs); debug_dma_unmap_resource(dev, addr, size, dir); } static inline void dma_sync_single_for_cpu(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr, size, dir); debug_dma_sync_single_for_cpu(dev, addr, size, dir); } static inline void dma_sync_single_for_device(struct device *dev, dma_addr_t addr, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr, size, dir); debug_dma_sync_single_for_device(dev, addr, size, dir); } static inline void dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_cpu) ops->sync_single_for_cpu(dev, addr + offset, size, dir); debug_dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); } static inline void dma_sync_single_range_for_device(struct device *dev, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_single_for_device) ops->sync_single_for_device(dev, addr + offset, size, dir); debug_dma_sync_single_range_for_device(dev, addr, offset, size, dir); } static inline void dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_sg_for_cpu) ops->sync_sg_for_cpu(dev, sg, nelems, dir); debug_dma_sync_sg_for_cpu(dev, sg, nelems, dir); } static inline void dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, enum dma_data_direction dir) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->sync_sg_for_device) ops->sync_sg_for_device(dev, sg, nelems, dir); debug_dma_sync_sg_for_device(dev, sg, nelems, dir); } #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0) #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size); void *dma_common_contiguous_remap(struct page *page, size_t size, unsigned long vm_flags, pgprot_t prot, const void *caller); void *dma_common_pages_remap(struct page **pages, size_t size, unsigned long vm_flags, pgprot_t prot, const void *caller); void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); /** * dma_mmap_attrs - map a coherent DMA allocation into user space * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices * @vma: vm_area_struct describing requested user mapping * @cpu_addr: kernel CPU-view address returned from dma_alloc_attrs * @handle: device-view address returned from dma_alloc_attrs * @size: size of memory originally requested in dma_alloc_attrs * @attrs: attributes of mapping properties requested in dma_alloc_attrs * * Map a coherent DMA buffer previously allocated by dma_alloc_attrs * into user space. The coherent DMA buffer must not be freed by the * driver until the user space mapping has been released. */ static inline int dma_mmap_attrs(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!ops); if (ops->mmap) return ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); } #define dma_mmap_coherent(d, v, c, h, s) dma_mmap_attrs(d, v, c, h, s, 0) int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size); static inline int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t dma_addr, size_t size, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!ops); if (ops->get_sgtable) return ops->get_sgtable(dev, sgt, cpu_addr, dma_addr, size, attrs); return dma_common_get_sgtable(dev, sgt, cpu_addr, dma_addr, size); } #define dma_get_sgtable(d, t, v, h, s) dma_get_sgtable_attrs(d, t, v, h, s, 0) #ifndef arch_dma_alloc_attrs #define arch_dma_alloc_attrs(dev, flag) (true) #endif static inline void *dma_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); void *cpu_addr; BUG_ON(!ops); if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr)) return cpu_addr; if (!arch_dma_alloc_attrs(&dev, &flag)) return NULL; if (!ops->alloc) return NULL; cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); debug_dma_alloc_coherent(dev, size, *dma_handle, cpu_addr); return cpu_addr; } static inline void dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!ops); WARN_ON(irqs_disabled()); if (dma_release_from_dev_coherent(dev, get_order(size), cpu_addr)) return; if (!ops->free || !cpu_addr) return; debug_dma_free_coherent(dev, size, cpu_addr, dma_handle); ops->free(dev, size, cpu_addr, dma_handle, attrs); } static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { return dma_alloc_attrs(dev, size, dma_handle, flag, 0); } static inline void dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_handle) { return dma_free_attrs(dev, size, cpu_addr, dma_handle, 0); } static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) { const struct dma_map_ops *ops = get_dma_ops(dev); debug_dma_mapping_error(dev, dma_addr); if (ops->mapping_error) return ops->mapping_error(dev, dma_addr); return 0; } static inline void dma_check_mask(struct device *dev, u64 mask) { if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); } static inline int dma_supported(struct device *dev, u64 mask) { const struct dma_map_ops *ops = get_dma_ops(dev); if (!ops) return 0; if (!ops->dma_supported) return 1; return ops->dma_supported(dev, mask); } #ifndef HAVE_ARCH_DMA_SET_MASK static inline int dma_set_mask(struct device *dev, u64 mask) { if (!dev->dma_mask || !dma_supported(dev, mask)) return -EIO; dma_check_mask(dev, mask); *dev->dma_mask = mask; return 0; } #endif static inline u64 dma_get_mask(struct device *dev) { if (dev && dev->dma_mask && *dev->dma_mask) return *dev->dma_mask; return DMA_BIT_MASK(32); } #ifdef CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK int dma_set_coherent_mask(struct device *dev, u64 mask); #else static inline int dma_set_coherent_mask(struct device *dev, u64 mask) { if (!dma_supported(dev, mask)) return -EIO; dma_check_mask(dev, mask); dev->coherent_dma_mask = mask; return 0; } #endif /* * Set both the DMA mask and the coherent DMA mask to the same thing. * Note that we don't check the return value from dma_set_coherent_mask() * as the DMA API guarantees that the coherent DMA mask can be set to * the same or smaller than the streaming DMA mask. */ static inline int dma_set_mask_and_coherent(struct device *dev, u64 mask) { int rc = dma_set_mask(dev, mask); if (rc == 0) dma_set_coherent_mask(dev, mask); return rc; } /* * Similar to the above, except it deals with the case where the device * does not have dev->dma_mask appropriately setup. */ static inline int dma_coerce_mask_and_coherent(struct device *dev, u64 mask) { dev->dma_mask = &dev->coherent_dma_mask; return dma_set_mask_and_coherent(dev, mask); } extern u64 dma_get_required_mask(struct device *dev); #ifndef arch_setup_dma_ops static inline void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent) { } #endif #ifndef arch_teardown_dma_ops static inline void arch_teardown_dma_ops(struct device *dev) { } #endif static inline unsigned int dma_get_max_seg_size(struct device *dev) { if (dev->dma_parms && dev->dma_parms->max_segment_size) return dev->dma_parms->max_segment_size; return SZ_64K; } static inline int dma_set_max_seg_size(struct device *dev, unsigned int size) { if (dev->dma_parms) { dev->dma_parms->max_segment_size = size; return 0; } return -EIO; } static inline unsigned long dma_get_seg_boundary(struct device *dev) { if (dev->dma_parms && dev->dma_parms->segment_boundary_mask) return dev->dma_parms->segment_boundary_mask; return DMA_BIT_MASK(32); } static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) { if (dev->dma_parms) { dev->dma_parms->segment_boundary_mask = mask; return 0; } return -EIO; } #ifndef dma_max_pfn static inline unsigned long dma_max_pfn(struct device *dev) { return *dev->dma_mask >> PAGE_SHIFT; } #endif static inline void *dma_zalloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { void *ret = dma_alloc_coherent(dev, size, dma_handle, flag | __GFP_ZERO); return ret; } static inline int dma_get_cache_alignment(void) { #ifdef ARCH_DMA_MINALIGN return ARCH_DMA_MINALIGN; #endif return 1; } /* flags for the coherent memory api */ #define DMA_MEMORY_EXCLUSIVE 0x01 #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags); void dma_release_declared_memory(struct device *dev); void *dma_mark_declared_memory_occupied(struct device *dev, dma_addr_t device_addr, size_t size); #else static inline int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags) { return -ENOSYS; } static inline void dma_release_declared_memory(struct device *dev) { } static inline void * dma_mark_declared_memory_occupied(struct device *dev, dma_addr_t device_addr, size_t size) { return ERR_PTR(-EBUSY); } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ #ifdef CONFIG_HAS_DMA int dma_configure(struct device *dev); void dma_deconfigure(struct device *dev); #else static inline int dma_configure(struct device *dev) { return 0; } static inline void dma_deconfigure(struct device *dev) {} #endif /* * Managed DMA API */ extern void *dmam_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp); extern void dmam_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle); extern void *dmam_alloc_attrs(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs); #ifdef CONFIG_HAVE_GENERIC_DMA_COHERENT extern int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, int flags); extern void dmam_release_declared_memory(struct device *dev); #else /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline int dmam_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr, dma_addr_t device_addr, size_t size, gfp_t gfp) { return 0; } static inline void dmam_release_declared_memory(struct device *dev) { } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ static inline void *dma_alloc_wc(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t gfp) { return dma_alloc_attrs(dev, size, dma_addr, gfp, DMA_ATTR_WRITE_COMBINE); } #ifndef dma_alloc_writecombine #define dma_alloc_writecombine dma_alloc_wc #endif static inline void dma_free_wc(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr) { return dma_free_attrs(dev, size, cpu_addr, dma_addr, DMA_ATTR_WRITE_COMBINE); } #ifndef dma_free_writecombine #define dma_free_writecombine dma_free_wc #endif static inline int dma_mmap_wc(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size) { return dma_mmap_attrs(dev, vma, cpu_addr, dma_addr, size, DMA_ATTR_WRITE_COMBINE); } #ifndef dma_mmap_writecombine #define dma_mmap_writecombine dma_mmap_wc #endif #if defined(CONFIG_NEED_DMA_MAP_STATE) || defined(CONFIG_DMA_API_DEBUG) #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) __u32 LEN_NAME #define dma_unmap_addr(PTR, ADDR_NAME) ((PTR)->ADDR_NAME) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) (((PTR)->ADDR_NAME) = (VAL)) #define dma_unmap_len(PTR, LEN_NAME) ((PTR)->LEN_NAME) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) (((PTR)->LEN_NAME) = (VAL)) #else #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) #define dma_unmap_addr(PTR, ADDR_NAME) (0) #define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) #define dma_unmap_len(PTR, LEN_NAME) (0) #define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif #endif
2 1072 1269 1107 2498 51 2568 727 2425 49 17 175 243 547 546 212 1998 2291 752 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NEIGHBOUR_H #define _NET_NEIGHBOUR_H #include <linux/neighbour.h> /* * Generic neighbour manipulation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * * Harald Welte: <laforge@gnumonks.org> * - Add neighbour cache statistics like rtstat */ #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/rtnetlink.h> /* * NUD stands for "neighbor unreachability detection" */ #define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE) #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) #define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE) struct neighbour; enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, NEIGH_VAR_ANYCAST_DELAY, NEIGH_VAR_PROXY_DELAY, NEIGH_VAR_LOCKTIME, #define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1) /* Following are used as a second way to access one of the above */ NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */ NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */ NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */ /* Following are used by "default" only */ NEIGH_VAR_GC_INTERVAL, NEIGH_VAR_GC_THRESH1, NEIGH_VAR_GC_THRESH2, NEIGH_VAR_GC_THRESH3, NEIGH_VAR_MAX }; struct neigh_parms { possible_net_t net; struct net_device *dev; struct list_head list; int (*neigh_setup)(struct neighbour *); void (*neigh_cleanup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; int dead; refcount_t refcnt; struct rcu_head rcu_head; int reachable_time; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); p->data[index] = val; } #define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ #define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) { bitmap_fill(p->data_state, NEIGH_VAR_DATA_MAX); } static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p) { bitmap_zero(p->data_state, NEIGH_VAR_DATA_MAX); } struct neigh_statistics { unsigned long allocs; /* number of allocated neighs */ unsigned long destroys; /* number of destroyed neighs */ unsigned long hash_grows; /* number of hash resizes */ unsigned long res_failed; /* number of failed resolutions */ unsigned long lookups; /* number of lookups */ unsigned long hits; /* number of hits (among lookups) */ unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */ unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */ unsigned long periodic_gc_runs; /* number of periodic GC runs */ unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { struct neighbour __rcu *next; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; unsigned long updated; rwlock_t lock; refcount_t refcnt; struct sk_buff_head arp_queue; unsigned int arp_queue_len_bytes; struct timer_list timer; unsigned long used; atomic_t probes; __u8 flags; __u8 nud_state; __u8 type; __u8 dead; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))]; struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct rcu_head rcu; struct net_device *dev; u8 primary_key[0]; } __randomize_layout; struct neigh_ops { int family; void (*solicit)(struct neighbour *, struct sk_buff *); void (*error_report)(struct neighbour *, struct sk_buff *); int (*output)(struct neighbour *, struct sk_buff *); int (*connected_output)(struct neighbour *, struct sk_buff *); }; struct pneigh_entry { struct pneigh_entry *next; possible_net_t net; struct net_device *dev; u8 flags; u8 key[0]; }; /* * neighbour table manipulation */ #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { struct neighbour __rcu **hash_buckets; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; }; struct neigh_table { int family; int entry_size; int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); bool (*key_eq)(const struct neighbour *, const void *pkey); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); char *id; struct neigh_parms parms; struct list_head parms_list; int gc_interval; int gc_thresh1; int gc_thresh2; int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; rwlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; struct pneigh_entry **phash_buckets; }; enum { NEIGH_ARP_TABLE = 0, NEIGH_ND_TABLE = 1, NEIGH_DN_TABLE = 2, NEIGH_NR_TABLES, NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ }; static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; } #define NEIGH_PRIV_ALIGN sizeof(long long) #define NEIGH_ENTRY_SIZE(size) ALIGN((size), NEIGH_PRIV_ALIGN) static inline void *neighbour_priv(const struct neighbour *n) { return (char *)n + n->tbl->entry_size; } /* flags for neigh_update() */ #define NEIGH_UPDATE_F_OVERRIDE 0x00000001 #define NEIGH_UPDATE_F_WEAK_OVERRIDE 0x00000002 #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER 0x00000004 #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 static inline bool neigh_key_eq16(const struct neighbour *n, const void *pkey) { return *(const u16 *)n->primary_key == *(const u16 *)pkey; } static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; } static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey) { const u32 *n32 = (const u32 *)n->primary_key; const u32 *p32 = pkey; return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) | (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0; } static inline struct neighbour *___neigh_lookup_noref( struct neigh_table *tbl, bool (*key_eq)(const struct neighbour *n, const void *pkey), __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd), const void *pkey, struct net_device *dev) { struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht); struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); n != NULL; n = rcu_dereference_bh(n->next)) { if (n->dev == dev && key_eq(n, pkey)) return n; } return NULL; } static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); } void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, const void *pkey); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev); struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl); void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); static inline struct net *neigh_parms_net(const struct neigh_parms *parms) { return read_pnet(&parms->net); } unsigned long neigh_rand_reach_time(unsigned long base); void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev, int creat); struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); static inline struct net *pneigh_net(const struct pneigh_entry *pneigh) { return read_pnet(&pneigh->net); } void neigh_app_ns(struct neighbour *n); void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); void pneigh_for_each(struct neigh_table *tbl, void (*cb)(struct pneigh_entry *)); struct neigh_seq_state { struct seq_net_private p; struct neigh_table *tbl; struct neigh_hash_table *nht; void *(*neigh_sub_iter)(struct neigh_seq_state *state, struct neighbour *n, loff_t *pos); unsigned int bucket; unsigned int flags; #define NEIGH_SEQ_NEIGH_ONLY 0x00000001 #define NEIGH_SEQ_IS_PNEIGH 0x00000002 #define NEIGH_SEQ_SKIP_NOARP 0x00000004 }; void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, unsigned int); void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); int neigh_proc_dointvec(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *proc_handler); void neigh_sysctl_unregister(struct neigh_parms *p); static inline void __neigh_parms_put(struct neigh_parms *parms) { refcount_dec(&parms->refcnt); } static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms) { refcount_inc(&parms->refcnt); return parms; } /* * Neighbour references */ static inline void neigh_release(struct neighbour *neigh) { if (refcount_dec_and_test(&neigh->refcnt)) neigh_destroy(neigh); } static inline struct neighbour * neigh_clone(struct neighbour *neigh) { if (neigh) refcount_inc(&neigh->refcnt); return neigh; } #define neigh_hold(n) refcount_inc(&(n)->refcnt) static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { unsigned long now = jiffies; if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) return __neigh_event_send(neigh, skb); return 0; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; do { seq = read_seqbegin(&hh->hh_lock); hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; } #endif static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; /* skb_push() would proceed silently if we have room for * the unaligned size but not for the aligned size: * check headroom explicitly. */ if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } } else { hh_alen = HH_DATA_ALIGN(hh_len); if (likely(skb_headroom(skb) >= hh_alen)) { memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } } } while (read_seqretry(&hh->hh_lock, seq)); if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { kfree_skb(skb); return NET_XMIT_DROP; } __skb_push(skb, hh_len); return dev_queue_xmit(skb); } static inline int neigh_output(struct neighbour *n, struct sk_buff *skb) { const struct hh_cache *hh = &n->hh; if ((n->nud_state & NUD_CONNECTED) && hh->hh_len) return neigh_hh_output(hh, skb); else return n->output(n, skb); } static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n || !creat) return n; n = neigh_create(tbl, pkey, dev); return IS_ERR(n) ? NULL : n; } static inline struct neighbour * __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n) return n; return neigh_create(tbl, pkey, dev); } struct neighbour_cb { unsigned long sched_next; unsigned int flags; }; #define LOCALLY_ENQUEUED 0x1 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, const struct net_device *dev) { unsigned int seq; do { seq = read_seqbegin(&n->ha_lock); memcpy(dst, n->ha, dev->addr_len); } while (read_seqretry(&n->ha_lock, seq)); } #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 // SPDX-License-Identifier: GPL-2.0 #include "reiserfs.h" #include <linux/errno.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/xattr.h> #include "xattr.h" #include <linux/uaccess.h> static int user_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; return reiserfs_xattr_get(inode, xattr_full_name(handler, name), buffer, size); } static int user_set(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { if (!reiserfs_xattrs_user(inode->i_sb)) return -EOPNOTSUPP; return reiserfs_xattr_set(inode, xattr_full_name(handler, name), buffer, size, flags); } static bool user_list(struct dentry *dentry) { return reiserfs_xattrs_user(dentry->d_sb); } const struct xattr_handler reiserfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = user_get, .set = user_set, .list = user_list, };
2 2 2 2 2 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 /* * linux/fs/nls/nls_iso8859-9.c * * Charset iso8859-9 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* 0x90*/ 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* 0xa0*/ 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* 0xb0*/ 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* 0xc0*/ 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* 0xd0*/ 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, /* 0xe0*/ 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* 0xf0*/ 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0x00, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x00, 0x00, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0x00, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x00, 0x00, 0xff, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xd0, 0xf0, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0xdd, 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x50-0x57 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xde, 0xfe, /* 0x58-0x5f */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xc0-0xc7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xc8-0xcf */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xd7, /* 0xd0-0xd7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0x69, 0xfe, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0x00, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xe0-0xe7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xe8-0xef */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xf7, /* 0xf0-0xf7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0x49, 0xde, 0x00, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "iso8859-9", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_iso8859_9(void) { return register_nls(&table); } static void __exit exit_nls_iso8859_9(void) { unregister_nls(&table); } module_init(init_nls_iso8859_9) module_exit(exit_nls_iso8859_9) MODULE_LICENSE("Dual BSD/GPL");
4 2 2 2 4 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 /* * Misc and compatibility things * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/init.h> #include <linux/export.h> #include <linux/moduleparam.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/ioport.h> #include <linux/fs.h> #include <sound/core.h> #ifdef CONFIG_SND_DEBUG #ifdef CONFIG_SND_DEBUG_VERBOSE #define DEFAULT_DEBUG_LEVEL 2 #else #define DEFAULT_DEBUG_LEVEL 1 #endif static int debug = DEFAULT_DEBUG_LEVEL; module_param(debug, int, 0644); MODULE_PARM_DESC(debug, "Debug level (0 = disable)"); #endif /* CONFIG_SND_DEBUG */ void release_and_free_resource(struct resource *res) { if (res) { release_resource(res); kfree(res); } } EXPORT_SYMBOL(release_and_free_resource); #ifdef CONFIG_SND_VERBOSE_PRINTK /* strip the leading path if the given path is absolute */ static const char *sanity_file_name(const char *path) { if (*path == '/') return strrchr(path, '/') + 1; else return path; } #endif #if defined(CONFIG_SND_DEBUG) || defined(CONFIG_SND_VERBOSE_PRINTK) void __snd_printk(unsigned int level, const char *path, int line, const char *format, ...) { va_list args; #ifdef CONFIG_SND_VERBOSE_PRINTK int kern_level; struct va_format vaf; char verbose_fmt[] = KERN_DEFAULT "ALSA %s:%d %pV"; bool level_found = false; #endif #ifdef CONFIG_SND_DEBUG if (debug < level) return; #endif va_start(args, format); #ifdef CONFIG_SND_VERBOSE_PRINTK vaf.fmt = format; vaf.va = &args; while ((kern_level = printk_get_level(vaf.fmt)) != 0) { const char *end_of_header = printk_skip_level(vaf.fmt); /* Ignore KERN_CONT. We print filename:line for each piece. */ if (kern_level >= '0' && kern_level <= '7') { memcpy(verbose_fmt, vaf.fmt, end_of_header - vaf.fmt); level_found = true; } vaf.fmt = end_of_header; } if (!level_found && level) memcpy(verbose_fmt, KERN_DEBUG, sizeof(KERN_DEBUG) - 1); printk(verbose_fmt, sanity_file_name(path), line, &vaf); #else vprintk(format, args); #endif va_end(args); } EXPORT_SYMBOL_GPL(__snd_printk); #endif #ifdef CONFIG_PCI #include <linux/pci.h> /** * snd_pci_quirk_lookup_id - look up a PCI SSID quirk list * @vendor: PCI SSV id * @device: PCI SSD id * @list: quirk list, terminated by a null entry * * Look through the given quirk list and finds a matching entry * with the same PCI SSID. When subdevice is 0, all subdevice * values may match. * * Returns the matched entry pointer, or NULL if nothing matched. */ const struct snd_pci_quirk * snd_pci_quirk_lookup_id(u16 vendor, u16 device, const struct snd_pci_quirk *list) { const struct snd_pci_quirk *q; for (q = list; q->subvendor; q++) { if (q->subvendor != vendor) continue; if (!q->subdevice || (device & q->subdevice_mask) == q->subdevice) return q; } return NULL; } EXPORT_SYMBOL(snd_pci_quirk_lookup_id); /** * snd_pci_quirk_lookup - look up a PCI SSID quirk list * @pci: pci_dev handle * @list: quirk list, terminated by a null entry * * Look through the given quirk list and finds a matching entry * with the same PCI SSID. When subdevice is 0, all subdevice * values may match. * * Returns the matched entry pointer, or NULL if nothing matched. */ const struct snd_pci_quirk * snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list) { if (!pci) return NULL; return snd_pci_quirk_lookup_id(pci->subsystem_vendor, pci->subsystem_device, list); } EXPORT_SYMBOL(snd_pci_quirk_lookup); #endif /* * Deferred async signal helpers * * Below are a few helper functions to wrap the async signal handling * in the deferred work. The main purpose is to avoid the messy deadlock * around tasklist_lock and co at the kill_fasync() invocation. * fasync_helper() and kill_fasync() are replaced with snd_fasync_helper() * and snd_kill_fasync(), respectively. In addition, snd_fasync_free() has * to be called at releasing the relevant file object. */ struct snd_fasync { struct fasync_struct *fasync; int signal; int poll; int on; struct list_head list; }; static DEFINE_SPINLOCK(snd_fasync_lock); static LIST_HEAD(snd_fasync_list); static void snd_fasync_work_fn(struct work_struct *work) { struct snd_fasync *fasync; spin_lock_irq(&snd_fasync_lock); while (!list_empty(&snd_fasync_list)) { fasync = list_first_entry(&snd_fasync_list, struct snd_fasync, list); list_del_init(&fasync->list); spin_unlock_irq(&snd_fasync_lock); if (fasync->on) kill_fasync(&fasync->fasync, fasync->signal, fasync->poll); spin_lock_irq(&snd_fasync_lock); } spin_unlock_irq(&snd_fasync_lock); } static DECLARE_WORK(snd_fasync_work, snd_fasync_work_fn); int snd_fasync_helper(int fd, struct file *file, int on, struct snd_fasync **fasyncp) { struct snd_fasync *fasync = NULL; if (on) { fasync = kzalloc(sizeof(*fasync), GFP_KERNEL); if (!fasync) return -ENOMEM; INIT_LIST_HEAD(&fasync->list); } spin_lock_irq(&snd_fasync_lock); if (*fasyncp) { kfree(fasync); fasync = *fasyncp; } else { if (!fasync) { spin_unlock_irq(&snd_fasync_lock); return 0; } *fasyncp = fasync; } fasync->on = on; spin_unlock_irq(&snd_fasync_lock); return fasync_helper(fd, file, on, &fasync->fasync); } EXPORT_SYMBOL_GPL(snd_fasync_helper); void snd_kill_fasync(struct snd_fasync *fasync, int signal, int poll) { unsigned long flags; if (!fasync || !fasync->on) return; spin_lock_irqsave(&snd_fasync_lock, flags); fasync->signal = signal; fasync->poll = poll; list_move(&fasync->list, &snd_fasync_list); schedule_work(&snd_fasync_work); spin_unlock_irqrestore(&snd_fasync_lock, flags); } EXPORT_SYMBOL_GPL(snd_kill_fasync); void snd_fasync_free(struct snd_fasync *fasync) { if (!fasync) return; fasync->on = 0; flush_work(&snd_fasync_work); kfree(fasync); } EXPORT_SYMBOL_GPL(snd_fasync_free);
3 18 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 /* * Memory-to-memory device framework for Video for Linux 2. * * Helper functions for devices that use memory buffers for both source * and destination. * * Copyright (c) 2009 Samsung Electronics Co., Ltd. * Pawel Osciak, <pawel@osciak.com> * Marek Szyprowski, <m.szyprowski@samsung.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by the * Free Software Foundation; either version 2 of the * License, or (at your option) any later version */ #ifndef _MEDIA_V4L2_MEM2MEM_H #define _MEDIA_V4L2_MEM2MEM_H #include <media/videobuf2-v4l2.h> /** * struct v4l2_m2m_ops - mem-to-mem device driver callbacks * @device_run: required. Begin the actual job (transaction) inside this * callback. * The job does NOT have to end before this callback returns * (and it will be the usual case). When the job finishes, * v4l2_m2m_job_finish() has to be called. * @job_ready: optional. Should return 0 if the driver does not have a job * fully prepared to run yet (i.e. it will not be able to finish a * transaction without sleeping). If not provided, it will be * assumed that one source and one destination buffer are all * that is required for the driver to perform one full transaction. * This method may not sleep. * @job_abort: required. Informs the driver that it has to abort the currently * running transaction as soon as possible (i.e. as soon as it can * stop the device safely; e.g. in the next interrupt handler), * even if the transaction would not have been finished by then. * After the driver performs the necessary steps, it has to call * v4l2_m2m_job_finish() (as if the transaction ended normally). * This function does not have to (and will usually not) wait * until the device enters a state when it can be stopped. * @lock: optional. Define a driver's own lock callback, instead of using * &v4l2_m2m_ctx->q_lock. * @unlock: optional. Define a driver's own unlock callback, instead of * using &v4l2_m2m_ctx->q_lock. */ struct v4l2_m2m_ops { void (*device_run)(void *priv); int (*job_ready)(void *priv); void (*job_abort)(void *priv); void (*lock)(void *priv); void (*unlock)(void *priv); }; struct v4l2_m2m_dev; /** * struct v4l2_m2m_queue_ctx - represents a queue for buffers ready to be * processed * * @q: pointer to struct &vb2_queue * @rdy_queue: List of V4L2 mem-to-mem queues * @rdy_spinlock: spin lock to protect the struct usage * @num_rdy: number of buffers ready to be processed * @buffered: is the queue buffered? * * Queue for buffers ready to be processed as soon as this * instance receives access to the device. */ struct v4l2_m2m_queue_ctx { struct vb2_queue q; struct list_head rdy_queue; spinlock_t rdy_spinlock; u8 num_rdy; bool buffered; }; /** * struct v4l2_m2m_ctx - Memory to memory context structure * * @q_lock: struct &mutex lock * @m2m_dev: opaque pointer to the internal data to handle M2M context * @cap_q_ctx: Capture (output to memory) queue context * @out_q_ctx: Output (input from memory) queue context * @queue: List of memory to memory contexts * @job_flags: Job queue flags, used internally by v4l2-mem2mem.c: * %TRANS_QUEUED, %TRANS_RUNNING and %TRANS_ABORT. * @finished: Wait queue used to signalize when a job queue finished. * @priv: Instance private data * * The memory to memory context is specific to a file handle, NOT to e.g. * a device. */ struct v4l2_m2m_ctx { /* optional cap/out vb2 queues lock */ struct mutex *q_lock; /* internal use only */ struct v4l2_m2m_dev *m2m_dev; struct v4l2_m2m_queue_ctx cap_q_ctx; struct v4l2_m2m_queue_ctx out_q_ctx; /* For device job queue */ struct list_head queue; unsigned long job_flags; wait_queue_head_t finished; void *priv; }; /** * struct v4l2_m2m_buffer - Memory to memory buffer * * @vb: pointer to struct &vb2_v4l2_buffer * @list: list of m2m buffers */ struct v4l2_m2m_buffer { struct vb2_v4l2_buffer vb; struct list_head list; }; /** * v4l2_m2m_get_curr_priv() - return driver private data for the currently * running instance or NULL if no instance is running * * @m2m_dev: opaque pointer to the internal data to handle M2M context */ void *v4l2_m2m_get_curr_priv(struct v4l2_m2m_dev *m2m_dev); /** * v4l2_m2m_get_vq() - return vb2_queue for the given type * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @type: type of the V4L2 buffer, as defined by enum &v4l2_buf_type */ struct vb2_queue *v4l2_m2m_get_vq(struct v4l2_m2m_ctx *m2m_ctx, enum v4l2_buf_type type); /** * v4l2_m2m_try_schedule() - check whether an instance is ready to be added to * the pending job queue and add it if so. * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * * There are three basic requirements an instance has to meet to be able to run: * 1) at least one source buffer has to be queued, * 2) at least one destination buffer has to be queued, * 3) streaming has to be on. * * If a queue is buffered (for example a decoder hardware ringbuffer that has * to be drained before doing streamoff), allow scheduling without v4l2 buffers * on that queue. * * There may also be additional, custom requirements. In such case the driver * should supply a custom callback (job_ready in v4l2_m2m_ops) that should * return 1 if the instance is ready. * An example of the above could be an instance that requires more than one * src/dst buffer per transaction. */ void v4l2_m2m_try_schedule(struct v4l2_m2m_ctx *m2m_ctx); /** * v4l2_m2m_job_finish() - inform the framework that a job has been finished * and have it clean up * * @m2m_dev: opaque pointer to the internal data to handle M2M context * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * * Called by a driver to yield back the device after it has finished with it. * Should be called as soon as possible after reaching a state which allows * other instances to take control of the device. * * This function has to be called only after &v4l2_m2m_ops->device_run * callback has been called on the driver. To prevent recursion, it should * not be called directly from the &v4l2_m2m_ops->device_run callback though. */ void v4l2_m2m_job_finish(struct v4l2_m2m_dev *m2m_dev, struct v4l2_m2m_ctx *m2m_ctx); static inline void v4l2_m2m_buf_done(struct vb2_v4l2_buffer *buf, enum vb2_buffer_state state) { vb2_buffer_done(&buf->vb2_buf, state); } /** * v4l2_m2m_reqbufs() - multi-queue-aware REQBUFS multiplexer * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @reqbufs: pointer to struct &v4l2_requestbuffers */ int v4l2_m2m_reqbufs(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_requestbuffers *reqbufs); /** * v4l2_m2m_querybuf() - multi-queue-aware QUERYBUF multiplexer * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @buf: pointer to struct &v4l2_buffer * * See v4l2_m2m_mmap() documentation for details. */ int v4l2_m2m_querybuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_buffer *buf); /** * v4l2_m2m_qbuf() - enqueue a source or destination buffer, depending on * the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @buf: pointer to struct &v4l2_buffer */ int v4l2_m2m_qbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_buffer *buf); /** * v4l2_m2m_dqbuf() - dequeue a source or destination buffer, depending on * the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @buf: pointer to struct &v4l2_buffer */ int v4l2_m2m_dqbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_buffer *buf); /** * v4l2_m2m_prepare_buf() - prepare a source or destination buffer, depending on * the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @buf: pointer to struct &v4l2_buffer */ int v4l2_m2m_prepare_buf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_buffer *buf); /** * v4l2_m2m_create_bufs() - create a source or destination buffer, depending * on the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @create: pointer to struct &v4l2_create_buffers */ int v4l2_m2m_create_bufs(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_create_buffers *create); /** * v4l2_m2m_expbuf() - export a source or destination buffer, depending on * the type * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @eb: pointer to struct &v4l2_exportbuffer */ int v4l2_m2m_expbuf(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct v4l2_exportbuffer *eb); /** * v4l2_m2m_streamon() - turn on streaming for a video queue * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @type: type of the V4L2 buffer, as defined by enum &v4l2_buf_type */ int v4l2_m2m_streamon(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, enum v4l2_buf_type type); /** * v4l2_m2m_streamoff() - turn off streaming for a video queue * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @type: type of the V4L2 buffer, as defined by enum &v4l2_buf_type */ int v4l2_m2m_streamoff(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, enum v4l2_buf_type type); /** * v4l2_m2m_poll() - poll replacement, for destination buffers only * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @wait: pointer to struct &poll_table_struct * * Call from the driver's poll() function. Will poll both queues. If a buffer * is available to dequeue (with dqbuf) from the source queue, this will * indicate that a non-blocking write can be performed, while read will be * returned in case of the destination queue. */ unsigned int v4l2_m2m_poll(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct poll_table_struct *wait); /** * v4l2_m2m_mmap() - source and destination queues-aware mmap multiplexer * * @file: pointer to struct &file * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vma: pointer to struct &vm_area_struct * * Call from driver's mmap() function. Will handle mmap() for both queues * seamlessly for videobuffer, which will receive normal per-queue offsets and * proper videobuf queue pointers. The differentiation is made outside videobuf * by adding a predefined offset to buffers from one of the queues and * subtracting it before passing it back to videobuf. Only drivers (and * thus applications) receive modified offsets. */ int v4l2_m2m_mmap(struct file *file, struct v4l2_m2m_ctx *m2m_ctx, struct vm_area_struct *vma); /** * v4l2_m2m_init() - initialize per-driver m2m data * * @m2m_ops: pointer to struct v4l2_m2m_ops * * Usually called from driver's ``probe()`` function. * * Return: returns an opaque pointer to the internal data to handle M2M context */ struct v4l2_m2m_dev *v4l2_m2m_init(const struct v4l2_m2m_ops *m2m_ops); /** * v4l2_m2m_release() - cleans up and frees a m2m_dev structure * * @m2m_dev: opaque pointer to the internal data to handle M2M context * * Usually called from driver's ``remove()`` function. */ void v4l2_m2m_release(struct v4l2_m2m_dev *m2m_dev); /** * v4l2_m2m_ctx_init() - allocate and initialize a m2m context * * @m2m_dev: opaque pointer to the internal data to handle M2M context * @drv_priv: driver's instance private data * @queue_init: a callback for queue type-specific initialization function * to be used for initializing videobuf_queues * * Usually called from driver's ``open()`` function. */ struct v4l2_m2m_ctx *v4l2_m2m_ctx_init(struct v4l2_m2m_dev *m2m_dev, void *drv_priv, int (*queue_init)(void *priv, struct vb2_queue *src_vq, struct vb2_queue *dst_vq)); static inline void v4l2_m2m_set_src_buffered(struct v4l2_m2m_ctx *m2m_ctx, bool buffered) { m2m_ctx->out_q_ctx.buffered = buffered; } static inline void v4l2_m2m_set_dst_buffered(struct v4l2_m2m_ctx *m2m_ctx, bool buffered) { m2m_ctx->cap_q_ctx.buffered = buffered; } /** * v4l2_m2m_ctx_release() - release m2m context * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * * Usually called from driver's release() function. */ void v4l2_m2m_ctx_release(struct v4l2_m2m_ctx *m2m_ctx); /** * v4l2_m2m_buf_queue() - add a buffer to the proper ready buffers list. * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vbuf: pointer to struct &vb2_v4l2_buffer * * Call from videobuf_queue_ops->ops->buf_queue, videobuf_queue_ops callback. */ void v4l2_m2m_buf_queue(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_v4l2_buffer *vbuf); /** * v4l2_m2m_num_src_bufs_ready() - return the number of source buffers ready for * use * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline unsigned int v4l2_m2m_num_src_bufs_ready(struct v4l2_m2m_ctx *m2m_ctx) { return m2m_ctx->out_q_ctx.num_rdy; } /** * v4l2_m2m_num_dst_bufs_ready() - return the number of destination buffers * ready for use * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline unsigned int v4l2_m2m_num_dst_bufs_ready(struct v4l2_m2m_ctx *m2m_ctx) { return m2m_ctx->cap_q_ctx.num_rdy; } /** * v4l2_m2m_next_buf() - return next buffer from the list of ready buffers * * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx */ void *v4l2_m2m_next_buf(struct v4l2_m2m_queue_ctx *q_ctx); /** * v4l2_m2m_next_src_buf() - return next source buffer from the list of ready * buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline void *v4l2_m2m_next_src_buf(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_next_buf(&m2m_ctx->out_q_ctx); } /** * v4l2_m2m_next_dst_buf() - return next destination buffer from the list of * ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline void *v4l2_m2m_next_dst_buf(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_next_buf(&m2m_ctx->cap_q_ctx); } /** * v4l2_m2m_for_each_dst_buf() - iterate over a list of destination ready * buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @b: current buffer of type struct v4l2_m2m_buffer */ #define v4l2_m2m_for_each_dst_buf(m2m_ctx, b) \ list_for_each_entry(b, &m2m_ctx->cap_q_ctx.rdy_queue, list) /** * v4l2_m2m_for_each_src_buf() - iterate over a list of source ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @b: current buffer of type struct v4l2_m2m_buffer */ #define v4l2_m2m_for_each_src_buf(m2m_ctx, b) \ list_for_each_entry(b, &m2m_ctx->out_q_ctx.rdy_queue, list) /** * v4l2_m2m_for_each_dst_buf_safe() - iterate over a list of destination ready * buffers safely * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @b: current buffer of type struct v4l2_m2m_buffer * @n: used as temporary storage */ #define v4l2_m2m_for_each_dst_buf_safe(m2m_ctx, b, n) \ list_for_each_entry_safe(b, n, &m2m_ctx->cap_q_ctx.rdy_queue, list) /** * v4l2_m2m_for_each_src_buf_safe() - iterate over a list of source ready * buffers safely * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @b: current buffer of type struct v4l2_m2m_buffer * @n: used as temporary storage */ #define v4l2_m2m_for_each_src_buf_safe(m2m_ctx, b, n) \ list_for_each_entry_safe(b, n, &m2m_ctx->out_q_ctx.rdy_queue, list) /** * v4l2_m2m_get_src_vq() - return vb2_queue for source buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_queue *v4l2_m2m_get_src_vq(struct v4l2_m2m_ctx *m2m_ctx) { return &m2m_ctx->out_q_ctx.q; } /** * v4l2_m2m_get_dst_vq() - return vb2_queue for destination buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline struct vb2_queue *v4l2_m2m_get_dst_vq(struct v4l2_m2m_ctx *m2m_ctx) { return &m2m_ctx->cap_q_ctx.q; } /** * v4l2_m2m_buf_remove() - take off a buffer from the list of ready buffers and * return it * * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx */ void *v4l2_m2m_buf_remove(struct v4l2_m2m_queue_ctx *q_ctx); /** * v4l2_m2m_src_buf_remove() - take off a source buffer from the list of ready * buffers and return it * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline void *v4l2_m2m_src_buf_remove(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_buf_remove(&m2m_ctx->out_q_ctx); } /** * v4l2_m2m_dst_buf_remove() - take off a destination buffer from the list of * ready buffers and return it * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx */ static inline void *v4l2_m2m_dst_buf_remove(struct v4l2_m2m_ctx *m2m_ctx) { return v4l2_m2m_buf_remove(&m2m_ctx->cap_q_ctx); } /** * v4l2_m2m_buf_remove_by_buf() - take off exact buffer from the list of ready * buffers * * @q_ctx: pointer to struct @v4l2_m2m_queue_ctx * @vbuf: the buffer to be removed */ void v4l2_m2m_buf_remove_by_buf(struct v4l2_m2m_queue_ctx *q_ctx, struct vb2_v4l2_buffer *vbuf); /** * v4l2_m2m_src_buf_remove_by_buf() - take off exact source buffer from the list * of ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vbuf: the buffer to be removed */ static inline void v4l2_m2m_src_buf_remove_by_buf(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_v4l2_buffer *vbuf) { v4l2_m2m_buf_remove_by_buf(&m2m_ctx->out_q_ctx, vbuf); } /** * v4l2_m2m_dst_buf_remove_by_buf() - take off exact destination buffer from the * list of ready buffers * * @m2m_ctx: m2m context assigned to the instance given by struct &v4l2_m2m_ctx * @vbuf: the buffer to be removed */ static inline void v4l2_m2m_dst_buf_remove_by_buf(struct v4l2_m2m_ctx *m2m_ctx, struct vb2_v4l2_buffer *vbuf) { v4l2_m2m_buf_remove_by_buf(&m2m_ctx->cap_q_ctx, vbuf); } struct vb2_v4l2_buffer * v4l2_m2m_buf_remove_by_idx(struct v4l2_m2m_queue_ctx *q_ctx, unsigned int idx); static inline struct vb2_v4l2_buffer * v4l2_m2m_src_buf_remove_by_idx(struct v4l2_m2m_ctx *m2m_ctx, unsigned int idx) { return v4l2_m2m_buf_remove_by_idx(&m2m_ctx->out_q_ctx, idx); } static inline struct vb2_v4l2_buffer * v4l2_m2m_dst_buf_remove_by_idx(struct v4l2_m2m_ctx *m2m_ctx, unsigned int idx) { return v4l2_m2m_buf_remove_by_idx(&m2m_ctx->cap_q_ctx, idx); } /* v4l2 ioctl helpers */ int v4l2_m2m_ioctl_reqbufs(struct file *file, void *priv, struct v4l2_requestbuffers *rb); int v4l2_m2m_ioctl_create_bufs(struct file *file, void *fh, struct v4l2_create_buffers *create); int v4l2_m2m_ioctl_querybuf(struct file *file, void *fh, struct v4l2_buffer *buf); int v4l2_m2m_ioctl_expbuf(struct file *file, void *fh, struct v4l2_exportbuffer *eb); int v4l2_m2m_ioctl_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf); int v4l2_m2m_ioctl_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf); int v4l2_m2m_ioctl_prepare_buf(struct file *file, void *fh, struct v4l2_buffer *buf); int v4l2_m2m_ioctl_streamon(struct file *file, void *fh, enum v4l2_buf_type type); int v4l2_m2m_ioctl_streamoff(struct file *file, void *fh, enum v4l2_buf_type type); int v4l2_m2m_fop_mmap(struct file *file, struct vm_area_struct *vma); unsigned int v4l2_m2m_fop_poll(struct file *file, poll_table *wait); #endif /* _MEDIA_V4L2_MEM2MEM_H */
294 325 325 322 73 24 73 74 73 24 73 48 49 18 49 31 74 74 72 3 3 3 3 3 2 3 3 77 143 79 221 223 224 222 1 223 222 174 224 224 1 105 83 83 105 12 11 12 11 109 109 109 109 109 40 34 36 20 105 105 105 105 104 1 105 105 98 69 109 109 3 3 3 11 2 11 1437 2 2 2 2 2 2 10 4 10 10 2 2 2425 2424 2423 2424 2424 2423 2425 80 2423 2425 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 /* * fs/kernfs/file.c - kernfs file implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> * * This file is released under the GPLv2. */ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/pagemap.h> #include <linux/sched/mm.h> #include <linux/fsnotify.h> #include "kernfs-internal.h" /* * There's one kernfs_open_file for each open file and one kernfs_open_node * for each kernfs_node with one or more open files. * * kernfs_node->attr.open points to kernfs_open_node. attr.open is * protected by kernfs_open_node_lock. * * filp->private_data points to seq_file whose ->private points to * kernfs_open_file. kernfs_open_files are chained at * kernfs_open_node->files, which is protected by kernfs_open_file_mutex. */ static DEFINE_SPINLOCK(kernfs_open_node_lock); static DEFINE_MUTEX(kernfs_open_file_mutex); struct kernfs_open_node { atomic_t refcnt; atomic_t event; wait_queue_head_t poll; struct list_head files; /* goes through kernfs_open_file.list */ }; /* * kernfs_notify() may be called from any context and bounces notifications * through a work item. To minimize space overhead in kernfs_node, the * pending queue is implemented as a singly linked list of kernfs_nodes. * The list is terminated with the self pointer so that whether a * kernfs_node is on the list or not can be determined by testing the next * pointer for NULL. */ #define KERNFS_NOTIFY_EOL ((void *)&kernfs_notify_list) static DEFINE_SPINLOCK(kernfs_notify_lock); static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL; static struct kernfs_open_file *kernfs_of(struct file *file) { return ((struct seq_file *)file->private_data)->private; } /* * Determine the kernfs_ops for the given kernfs_node. This function must * be called while holding an active reference. */ static const struct kernfs_ops *kernfs_ops(struct kernfs_node *kn) { if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); return kn->attr.ops; } /* * As kernfs_seq_stop() is also called after kernfs_seq_start() or * kernfs_seq_next() failure, it needs to distinguish whether it's stopping * a seq_file iteration which is fully initialized with an active reference * or an aborted kernfs_seq_start() due to get_active failure. The * position pointer is the only context for each seq_file iteration and * thus the stop condition should be encoded in it. As the return value is * directly visible to userland, ERR_PTR(-ENODEV) is the only acceptable * choice to indicate get_active failure. * * Unfortunately, this is complicated due to the optional custom seq_file * operations which may return ERR_PTR(-ENODEV) too. kernfs_seq_stop() * can't distinguish whether ERR_PTR(-ENODEV) is from get_active failure or * custom seq_file operations and thus can't decide whether put_active * should be performed or not only on ERR_PTR(-ENODEV). * * This is worked around by factoring out the custom seq_stop() and * put_active part into kernfs_seq_stop_active(), skipping it from * kernfs_seq_stop() if ERR_PTR(-ENODEV) while invoking it directly after * custom seq_file operations fail with ERR_PTR(-ENODEV) - this ensures * that kernfs_seq_stop_active() is skipped only after get_active failure. */ static void kernfs_seq_stop_active(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_stop) ops->seq_stop(sf, v); kernfs_put_active(of->kn); } static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops; /* * @of->mutex nests outside active ref and is primarily to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) return ERR_PTR(-ENODEV); ops = kernfs_ops(of->kn); if (ops->seq_start) { void *next = ops->seq_start(sf, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(). Returns * !NULL if pos is at the beginning; otherwise, NULL. */ return NULL + !*ppos; } } static void *kernfs_seq_next(struct seq_file *sf, void *v, loff_t *ppos) { struct kernfs_open_file *of = sf->private; const struct kernfs_ops *ops = kernfs_ops(of->kn); if (ops->seq_next) { void *next = ops->seq_next(sf, v, ppos); /* see the comment above kernfs_seq_stop_active() */ if (next == ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, next); return next; } else { /* * The same behavior and code as single_open(), always * terminate after the initial read. */ ++*ppos; return NULL; } } static void kernfs_seq_stop(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; if (v != ERR_PTR(-ENODEV)) kernfs_seq_stop_active(sf, v); mutex_unlock(&of->mutex); } static int kernfs_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; of->event = atomic_read(&of->kn->attr.open->event); return of->kn->attr.ops->seq_show(sf, v); } static const struct seq_operations kernfs_seq_ops = { .start = kernfs_seq_start, .next = kernfs_seq_next, .stop = kernfs_seq_stop, .show = kernfs_seq_show, }; /* * As reading a bin file can have side-effects, the exact offset and bytes * specified in read(2) call should be passed to the read callback making * it difficult to use seq_file. Implement simplistic custom buffering for * bin files. */ static ssize_t kernfs_file_direct_read(struct kernfs_open_file *of, char __user *user_buf, size_t count, loff_t *ppos) { ssize_t len = min_t(size_t, count, PAGE_SIZE); const struct kernfs_ops *ops; char *buf; buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len, GFP_KERNEL); if (!buf) return -ENOMEM; /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { len = -ENODEV; mutex_unlock(&of->mutex); goto out_free; } of->event = atomic_read(&of->kn->attr.open->event); ops = kernfs_ops(of->kn); if (ops->read) len = ops->read(of, buf, len, *ppos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len < 0) goto out_free; if (copy_to_user(user_buf, buf, len)) { len = -EFAULT; goto out_free; } *ppos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } /** * kernfs_fop_read - kernfs vfs read callback * @file: file pointer * @user_buf: data to write * @count: number of bytes * @ppos: starting offset */ static ssize_t kernfs_fop_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { struct kernfs_open_file *of = kernfs_of(file); if (of->kn->flags & KERNFS_HAS_SEQ_SHOW) return seq_read(file, user_buf, count, ppos); else return kernfs_file_direct_read(of, user_buf, count, ppos); } /** * kernfs_fop_write - kernfs vfs write callback * @file: file pointer * @user_buf: data to write * @count: number of bytes * @ppos: starting offset * * Copy data in from userland and pass it to the matching kernfs write * operation. * * There is no easy way for us to know if userspace is only doing a partial * write, so we don't support them. We expect the entire buffer to come on * the first write. Hint: if you're writing a value, first read the file, * modify only the the value you're changing, then write entire buffer * back. */ static ssize_t kernfs_fop_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; ssize_t len; char *buf; if (of->atomic_write_len) { len = count; if (len > of->atomic_write_len) return -E2BIG; } else { len = min_t(size_t, count, PAGE_SIZE); } buf = of->prealloc_buf; if (buf) mutex_lock(&of->prealloc_mutex); else buf = kmalloc(len + 1, GFP_KERNEL); if (!buf) return -ENOMEM; if (copy_from_user(buf, user_buf, len)) { len = -EFAULT; goto out_free; } buf[len] = '\0'; /* guarantee string termination */ /* * @of->mutex nests outside active ref and is used both to ensure that * the ops aren't called concurrently for the same open file. */ mutex_lock(&of->mutex); if (!kernfs_get_active(of->kn)) { mutex_unlock(&of->mutex); len = -ENODEV; goto out_free; } ops = kernfs_ops(of->kn); if (ops->write) len = ops->write(of, buf, len, *ppos); else len = -EINVAL; kernfs_put_active(of->kn); mutex_unlock(&of->mutex); if (len > 0) *ppos += len; out_free: if (buf == of->prealloc_buf) mutex_unlock(&of->prealloc_mutex); else kfree(buf); return len; } static void kernfs_vma_open(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); if (!of->vm_ops) return; if (!kernfs_get_active(of->kn)) return; if (of->vm_ops->open) of->vm_ops->open(vma); kernfs_put_active(of->kn); } static int kernfs_vma_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS; if (of->vm_ops->fault) ret = of->vm_ops->fault(vmf); kernfs_put_active(of->kn); return ret; } static int kernfs_vma_page_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return VM_FAULT_SIGBUS; if (!kernfs_get_active(of->kn)) return VM_FAULT_SIGBUS; ret = 0; if (of->vm_ops->page_mkwrite) ret = of->vm_ops->page_mkwrite(vmf); else file_update_time(file); kernfs_put_active(of->kn); return ret; } static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return -EINVAL; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = -EINVAL; if (of->vm_ops->access) ret = of->vm_ops->access(vma, addr, buf, len, write); kernfs_put_active(of->kn); return ret; } #ifdef CONFIG_NUMA static int kernfs_vma_set_policy(struct vm_area_struct *vma, struct mempolicy *new) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); int ret; if (!of->vm_ops) return 0; if (!kernfs_get_active(of->kn)) return -EINVAL; ret = 0; if (of->vm_ops->set_policy) ret = of->vm_ops->set_policy(vma, new); kernfs_put_active(of->kn); return ret; } static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, unsigned long addr) { struct file *file = vma->vm_file; struct kernfs_open_file *of = kernfs_of(file); struct mempolicy *pol; if (!of->vm_ops) return vma->vm_policy; if (!kernfs_get_active(of->kn)) return vma->vm_policy; pol = vma->vm_policy; if (of->vm_ops->get_policy) pol = of->vm_ops->get_policy(vma, addr); kernfs_put_active(of->kn); return pol; } #endif static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, #ifdef CONFIG_NUMA .set_policy = kernfs_vma_set_policy, .get_policy = kernfs_vma_get_policy, #endif }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) { struct kernfs_open_file *of = kernfs_of(file); const struct kernfs_ops *ops; int rc; /* * mmap path and of->mutex are prone to triggering spurious lockdep * warnings and we don't want to add spurious locking dependency * between the two. Check whether mmap is actually implemented * without grabbing @of->mutex by testing HAS_MMAP flag. See the * comment in kernfs_file_open() for more details. */ if (!(of->kn->flags & KERNFS_HAS_MMAP)) return -ENODEV; mutex_lock(&of->mutex); rc = -ENODEV; if (!kernfs_get_active(of->kn)) goto out_unlock; ops = kernfs_ops(of->kn); rc = ops->mmap(of, vma); if (rc) goto out_put; /* * PowerPC's pci_mmap of legacy_mem uses shmem_zero_setup() * to satisfy versions of X which crash if the mmap fails: that * substitutes a new vm_file, and we don't then want bin_vm_ops. */ if (vma->vm_file != file) goto out_put; rc = -EINVAL; if (of->mmapped && of->vm_ops != vma->vm_ops) goto out_put; /* * It is not possible to successfully wrap close. * So error if someone is trying to use close. */ rc = -EINVAL; if (vma->vm_ops && vma->vm_ops->close) goto out_put; rc = 0; of->mmapped = true; of->vm_ops = vma->vm_ops; vma->vm_ops = &kernfs_vm_ops; out_put: kernfs_put_active(of->kn); out_unlock: mutex_unlock(&of->mutex); return rc; } /** * kernfs_get_open_node - get or create kernfs_open_node * @kn: target kernfs_node * @of: kernfs_open_file for this instance of open * * If @kn->attr.open exists, increment its reference count; otherwise, * create one. @of is chained to the files list. * * LOCKING: * Kernel thread context (may sleep). * * RETURNS: * 0 on success, -errno on failure. */ static int kernfs_get_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on, *new_on = NULL; retry: mutex_lock(&kernfs_open_file_mutex); spin_lock_irq(&kernfs_open_node_lock); if (!kn->attr.open && new_on) { kn->attr.open = new_on; new_on = NULL; } on = kn->attr.open; if (on) { atomic_inc(&on->refcnt); list_add_tail(&of->list, &on->files); } spin_unlock_irq(&kernfs_open_node_lock); mutex_unlock(&kernfs_open_file_mutex); if (on) { kfree(new_on); return 0; } /* not there, initialize a new one and retry */ new_on = kmalloc(sizeof(*new_on), GFP_KERNEL); if (!new_on) return -ENOMEM; atomic_set(&new_on->refcnt, 0); atomic_set(&new_on->event, 1); init_waitqueue_head(&new_on->poll); INIT_LIST_HEAD(&new_on->files); goto retry; } /** * kernfs_put_open_node - put kernfs_open_node * @kn: target kernfs_nodet * @of: associated kernfs_open_file * * Put @kn->attr.open and unlink @of from the files list. If * reference count reaches zero, disassociate and free it. * * LOCKING: * None. */ static void kernfs_put_open_node(struct kernfs_node *kn, struct kernfs_open_file *of) { struct kernfs_open_node *on = kn->attr.open; unsigned long flags; mutex_lock(&kernfs_open_file_mutex); spin_lock_irqsave(&kernfs_open_node_lock, flags); if (of) list_del(&of->list); if (atomic_dec_and_test(&on->refcnt)) kn->attr.open = NULL; else on = NULL; spin_unlock_irqrestore(&kernfs_open_node_lock, flags); mutex_unlock(&kernfs_open_file_mutex); kfree(on); } static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; int error = -EACCES; if (!kernfs_get_active(kn)) return -ENODEV; ops = kernfs_ops(kn); has_read = ops->seq_show || ops->read || ops->mmap; has_write = ops->write || ops->mmap; has_mmap = ops->mmap; /* see the flag definition for details */ if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { if ((file->f_mode & FMODE_WRITE) && (!(inode->i_mode & S_IWUGO) || !has_write)) goto err_out; if ((file->f_mode & FMODE_READ) && (!(inode->i_mode & S_IRUGO) || !has_read)) goto err_out; } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL); if (!of) goto err_out; /* * The following is done to give a different lockdep key to * @of->mutex for files which implement mmap. This is a rather * crude way to avoid false positive lockdep warning around * mm->mmap_sem - mmap nests @of->mutex under mm->mmap_sem and * reading /sys/block/sda/trace/act_mask grabs sr_mutex, under * which mm->mmap_sem nests, while holding @of->mutex. As each * open file has a separate mutex, it's okay as long as those don't * happen on the same file. At this point, we can't easily give * each file a separate locking class. Let's differentiate on * whether the file has mmap or not for now. * * Both paths of the branch look the same. They're supposed to * look that way and give @of->mutex different static lockdep keys. */ if (has_mmap) mutex_init(&of->mutex); else mutex_init(&of->mutex); of->kn = kn; of->file = file; /* * Write path needs to atomic_write_len outside active reference. * Cache it in open_file. See kernfs_fop_write() for details. */ of->atomic_write_len = ops->atomic_write_len; error = -EINVAL; /* * ->seq_show is incompatible with ->prealloc, * as seq_read does its own allocation. * ->read must be used instead. */ if (ops->prealloc && ops->seq_show) goto err_free; if (ops->prealloc) { int len = of->atomic_write_len ?: PAGE_SIZE; of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL); error = -ENOMEM; if (!of->prealloc_buf) goto err_free; mutex_init(&of->prealloc_mutex); } /* * Always instantiate seq_file even if read access doesn't use * seq_file or is not requested. This unifies private data access * and readable regular files are the vast majority anyway. */ if (ops->seq_show) error = seq_open(file, &kernfs_seq_ops); else error = seq_open(file, NULL); if (error) goto err_free; of->seq_file = file->private_data; of->seq_file->private = of; /* seq_file clears PWRITE unconditionally, restore it if WRITE */ if (file->f_mode & FMODE_WRITE) file->f_mode |= FMODE_PWRITE; /* make sure we have open node struct */ error = kernfs_get_open_node(kn, of); if (error) goto err_seq_release; if (ops->open) { /* nobody has access to @of yet, skip @of->mutex */ error = ops->open(of); if (error) goto err_put_node; } /* open succeeded, put active references */ kernfs_put_active(kn); return 0; err_put_node: kernfs_put_open_node(kn, of); err_seq_release: seq_release(inode, file); err_free: kfree(of->prealloc_buf); kfree(of); err_out: kernfs_put_active(kn); return error; } /* used from release/drain to ensure that ->release() is called exactly once */ static void kernfs_release_file(struct kernfs_node *kn, struct kernfs_open_file *of) { /* * @of is guaranteed to have no other file operations in flight and * we just want to synchronize release and drain paths. * @kernfs_open_file_mutex is enough. @of->mutex can't be used * here because drain path may be called from places which can * cause circular dependency. */ lockdep_assert_held(&kernfs_open_file_mutex); if (!of->released) { /* * A file is never detached without being released and we * need to be able to release files which are deactivated * and being drained. Don't use kernfs_ops(). */ kn->attr.ops->release(of); of->released = true; } } static int kernfs_fop_release(struct inode *inode, struct file *filp) { struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { mutex_lock(&kernfs_open_file_mutex); kernfs_release_file(kn, of); mutex_unlock(&kernfs_open_file_mutex); } kernfs_put_open_node(kn, of); seq_release(inode, filp); kfree(of->prealloc_buf); kfree(of); return 0; } void kernfs_drain_open_files(struct kernfs_node *kn) { struct kernfs_open_node *on; struct kernfs_open_file *of; if (!(kn->flags & (KERNFS_HAS_MMAP | KERNFS_HAS_RELEASE))) return; spin_lock_irq(&kernfs_open_node_lock); on = kn->attr.open; if (on) atomic_inc(&on->refcnt); spin_unlock_irq(&kernfs_open_node_lock); if (!on) return; mutex_lock(&kernfs_open_file_mutex); list_for_each_entry(of, &on->files, list) { struct inode *inode = file_inode(of->file); if (kn->flags & KERNFS_HAS_MMAP) unmap_mapping_range(inode->i_mapping, 0, 0, 1); if (kn->flags & KERNFS_HAS_RELEASE) kernfs_release_file(kn, of); } mutex_unlock(&kernfs_open_file_mutex); kernfs_put_open_node(kn, NULL); } /* * Kernfs attribute files are pollable. The idea is that you read * the content and then you use 'poll' or 'select' to wait for * the content to change. When the content changes (assuming the * manager for the kobject supports notification), poll will * return POLLERR|POLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return * an appropriate error code). When in doubt, set a suitable timeout value. */ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); struct kernfs_open_node *on = kn->attr.open; if (!kernfs_get_active(kn)) goto trigger; poll_wait(filp, &on->poll, wait); kernfs_put_active(kn); if (of->event != atomic_read(&on->event)) goto trigger; return DEFAULT_POLLMASK; trigger: return DEFAULT_POLLMASK|POLLERR|POLLPRI; } static void kernfs_notify_workfn(struct work_struct *work) { struct kernfs_node *kn; struct kernfs_open_node *on; struct kernfs_super_info *info; repeat: /* pop one off the notify_list */ spin_lock_irq(&kernfs_notify_lock); kn = kernfs_notify_list; if (kn == KERNFS_NOTIFY_EOL) { spin_unlock_irq(&kernfs_notify_lock); return; } kernfs_notify_list = kn->attr.notify_next; kn->attr.notify_next = NULL; spin_unlock_irq(&kernfs_notify_lock); /* kick poll */ spin_lock_irq(&kernfs_open_node_lock); on = kn->attr.open; if (on) { atomic_inc(&on->event); wake_up_interruptible(&on->poll); } spin_unlock_irq(&kernfs_open_node_lock); /* kick fsnotify */ mutex_lock(&kernfs_mutex); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *inode; /* * We want fsnotify_modify() on @kn but as the * modifications aren't originating from userland don't * have the matching @file available. Look up the inodes * and generate the events manually. */ inode = ilookup(info->sb, kn->id.ino); if (!inode) continue; parent = kernfs_get_parent(kn); if (parent) { struct inode *p_inode; p_inode = ilookup(info->sb, parent->id.ino); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, kn->name, 0); iput(p_inode); } kernfs_put(parent); } fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE, kn->name, 0); iput(inode); } mutex_unlock(&kernfs_mutex); kernfs_put(kn); goto repeat; } /** * kernfs_notify - notify a kernfs file * @kn: file to notify * * Notify @kn such that poll(2) on @kn wakes up. Maybe be called from any * context. */ void kernfs_notify(struct kernfs_node *kn) { static DECLARE_WORK(kernfs_notify_work, kernfs_notify_workfn); unsigned long flags; if (WARN_ON(kernfs_type(kn) != KERNFS_FILE)) return; spin_lock_irqsave(&kernfs_notify_lock, flags); if (!kn->attr.notify_next) { kernfs_get(kn); kn->attr.notify_next = kernfs_notify_list; kernfs_notify_list = kn; schedule_work(&kernfs_notify_work); } spin_unlock_irqrestore(&kernfs_notify_lock, flags); } EXPORT_SYMBOL_GPL(kernfs_notify); const struct file_operations kernfs_file_fops = { .read = kernfs_fop_read, .write = kernfs_fop_write, .llseek = generic_file_llseek, .mmap = kernfs_fop_mmap, .open = kernfs_fop_open, .release = kernfs_fop_release, .poll = kernfs_fop_poll, .fsync = noop_fsync, }; /** * __kernfs_create_file - kernfs internal function to create a file * @parent: directory to create the file in * @name: name of the file * @mode: mode of the file * @size: size of the file * @ops: kernfs operations for the file * @priv: private data for the file * @ns: optional namespace tag of the file * @key: lockdep key for the file's active_ref, %NULL to disable lockdep * * Returns the created node on success, ERR_PTR() value on error. */ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { struct kernfs_node *kn; unsigned flags; int rc; flags = KERNFS_FILE; kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); if (!kn) return ERR_PTR(-ENOMEM); kn->attr.ops = ops; kn->attr.size = size; kn->ns = ns; kn->priv = priv; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { lockdep_init_map(&kn->dep_map, "kn->count", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif /* * kn->attr.ops is accesible only while holding active ref. We * need to know whether some ops are implemented outside active * ref. Cache their existence in flags. */ if (ops->seq_show) kn->flags |= KERNFS_HAS_SEQ_SHOW; if (ops->mmap) kn->flags |= KERNFS_HAS_MMAP; if (ops->release) kn->flags |= KERNFS_HAS_RELEASE; rc = kernfs_add_one(kn); if (rc) { kernfs_put(kn); return ERR_PTR(rc); } return kn; }
7447 1 209 5118 4497 4450 4492 6692 5144 4499 5146 17 17 10 17 602 481 601 4962 179 178 179 4967 4961 6713 6713 428 54 53 53 455 33 42 32 4689 1 5150 5148 5147 31 31 31 31 28 29 31 5064 5050 5056 5058 5057 395 395 2 395 689 690 149 574 690 690 673 667 667 6 662 662 667 667 662 673 396 395 378 395 390 395 396 5026 5027 1112 1114 5027 5028 5017 2114 4079 4141 1265 858 3464 87 12 31 29 28 30 7 34 34 33 33 33 32 31 24 30 4 27 23 4 17 13 14 14 1 29 27 27 28 28 25 11 10 1 9 8 7 3 6 35 7 28 28 35 73 5149 3 2 2 222 223 223 223 15 12 12 12 3652 3 3 3 1 3652 3652 623 3647 1 7716 7623 7718 238 237 2087 2087 7670 5142 5147 5145 4 5148 5147 4 4840 13 3656 3646 3646 3067 3067 1664 2833 2837 2834 2819 2835 2 2833 2836 1199 12 2840 4142 4144 10 4141 4144 6 538 39 39 35 26 34 2 4 3 10 10 8 6 7 4 2 3 3 2 2 2 1 2 1 2 2 2 21 19 18 21 2 1 1 3 2 1 3 2 2 3 3 3 3 2 1 1 3 2 1 1 1 5161 5158 60 5156 552 550 548 4997 492 5154 5153 5156 27 5147 5147 50 5151 604 603 599 138 599 12 12 599 599 10 599 162 5 603 28 28 28 28 28 27 28 27 28 28 28 28 28 28 4 631 630 70 70 588 631 629 342 1 396 185 394 5 625 593 597 596 594 195 592 583 3 6 6 590 3408 2842 2844 3 2 761 2 2 2 3406 3411 6 4 3 3 1 1 1 3401 4996 4986 4970 4954 4952 4220 4693 4663 148 4660 2075 2073 78 12 2005 2072 18 21 20 20 20 18 20 21 21 16 16 21 20 21 21 21 20 1 21 12 28 28 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 /* * NETLINK Kernel-user communication protocol. * * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Patrick McHardy <kaber@trash.net> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith * added netlink_proto_exit * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br> * use nlk_sk, as sk->protinfo is on a diet 8) * Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org> * - inc module use count of module that owns * the kernel socket in case userspace opens * socket of same protocol * - remove all module support, since netlink is * mandatory if CONFIG_NET=y these days */ #include <linux/module.h> #include <linux/capability.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/signal.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/un.h> #include <linux/fcntl.h> #include <linux/termios.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/security.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/random.h> #include <linux/bitops.h> #include <linux/mm.h> #include <linux/types.h> #include <linux/audit.h> #include <linux/mutex.h> #include <linux/vmalloc.h> #include <linux/if_arp.h> #include <linux/rhashtable.h> #include <asm/cacheflush.h> #include <linux/hash.h> #include <linux/genetlink.h> #include <linux/net_namespace.h> #include <linux/nospec.h> #include <net/net_namespace.h> #include <net/sock.h> #include <net/scm.h> #include <net/netlink.h> #include "af_netlink.h" struct listeners { struct rcu_head rcu; unsigned long masks[0]; }; /* state bits */ #define NETLINK_S_CONGESTED 0x0 static inline int netlink_is_kernel(struct sock *sk) { return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; } struct netlink_table *nl_table __read_mostly; EXPORT_SYMBOL_GPL(nl_table); static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS]; static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = { "nlk_cb_mutex-ROUTE", "nlk_cb_mutex-1", "nlk_cb_mutex-USERSOCK", "nlk_cb_mutex-FIREWALL", "nlk_cb_mutex-SOCK_DIAG", "nlk_cb_mutex-NFLOG", "nlk_cb_mutex-XFRM", "nlk_cb_mutex-SELINUX", "nlk_cb_mutex-ISCSI", "nlk_cb_mutex-AUDIT", "nlk_cb_mutex-FIB_LOOKUP", "nlk_cb_mutex-CONNECTOR", "nlk_cb_mutex-NETFILTER", "nlk_cb_mutex-IP6_FW", "nlk_cb_mutex-DNRTMSG", "nlk_cb_mutex-KOBJECT_UEVENT", "nlk_cb_mutex-GENERIC", "nlk_cb_mutex-17", "nlk_cb_mutex-SCSITRANSPORT", "nlk_cb_mutex-ECRYPTFS", "nlk_cb_mutex-RDMA", "nlk_cb_mutex-CRYPTO", "nlk_cb_mutex-SMC", "nlk_cb_mutex-23", "nlk_cb_mutex-24", "nlk_cb_mutex-25", "nlk_cb_mutex-26", "nlk_cb_mutex-27", "nlk_cb_mutex-28", "nlk_cb_mutex-29", "nlk_cb_mutex-30", "nlk_cb_mutex-31", "nlk_cb_mutex-MAX_LINKS" }; static int netlink_dump(struct sock *sk); static void netlink_skb_destructor(struct sk_buff *skb); /* nl_table locking explained: * Lookup and traversal are protected with an RCU read-side lock. Insertion * and removal are protected with per bucket lock while using RCU list * modification primitives and may run in parallel to RCU protected lookups. * Destruction of the Netlink socket may only occur *after* nl_table_lock has * been acquired * either during or after the socket has been removed from * the list and after an RCU grace period. */ DEFINE_RWLOCK(nl_table_lock); EXPORT_SYMBOL_GPL(nl_table_lock); static atomic_t nl_table_users = ATOMIC_INIT(0); #define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock)); static BLOCKING_NOTIFIER_HEAD(netlink_chain); static DEFINE_SPINLOCK(netlink_tap_lock); static struct list_head netlink_tap_all __read_mostly; static const struct rhashtable_params netlink_rhashtable_params; static inline u32 netlink_group_mask(u32 group) { if (group > 32) return 0; return group ? 1 << (group - 1) : 0; } static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb, gfp_t gfp_mask) { unsigned int len = skb_end_offset(skb); struct sk_buff *new; new = alloc_skb(len, gfp_mask); if (new == NULL) return NULL; NETLINK_CB(new).portid = NETLINK_CB(skb).portid; NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group; NETLINK_CB(new).creds = NETLINK_CB(skb).creds; skb_put_data(new, skb->data, len); return new; } int netlink_add_tap(struct netlink_tap *nt) { if (unlikely(nt->dev->type != ARPHRD_NETLINK)) return -EINVAL; spin_lock(&netlink_tap_lock); list_add_rcu(&nt->list, &netlink_tap_all); spin_unlock(&netlink_tap_lock); __module_get(nt->module); return 0; } EXPORT_SYMBOL_GPL(netlink_add_tap); static int __netlink_remove_tap(struct netlink_tap *nt) { bool found = false; struct netlink_tap *tmp; spin_lock(&netlink_tap_lock); list_for_each_entry(tmp, &netlink_tap_all, list) { if (nt == tmp) { list_del_rcu(&nt->list); found = true; goto out; } } pr_warn("__netlink_remove_tap: %p not found\n", nt); out: spin_unlock(&netlink_tap_lock); if (found) module_put(nt->module); return found ? 0 : -ENODEV; } int netlink_remove_tap(struct netlink_tap *nt) { int ret; ret = __netlink_remove_tap(nt); synchronize_net(); return ret; } EXPORT_SYMBOL_GPL(netlink_remove_tap); static bool netlink_filter_tap(const struct sk_buff *skb) { struct sock *sk = skb->sk; /* We take the more conservative approach and * whitelist socket protocols that may pass. */ switch (sk->sk_protocol) { case NETLINK_ROUTE: case NETLINK_USERSOCK: case NETLINK_SOCK_DIAG: case NETLINK_NFLOG: case NETLINK_XFRM: case NETLINK_FIB_LOOKUP: case NETLINK_NETFILTER: case NETLINK_GENERIC: return true; } return false; } static int __netlink_deliver_tap_skb(struct sk_buff *skb, struct net_device *dev) { struct sk_buff *nskb; struct sock *sk = skb->sk; int ret = -ENOMEM; if (!net_eq(dev_net(dev), sock_net(sk))) return 0; dev_hold(dev); if (is_vmalloc_addr(skb->head)) nskb = netlink_to_full_skb(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); if (nskb) { nskb->dev = dev; nskb->protocol = htons((u16) sk->sk_protocol); nskb->pkt_type = netlink_is_kernel(sk) ? PACKET_KERNEL : PACKET_USER; skb_reset_network_header(nskb); ret = dev_queue_xmit(nskb); if (unlikely(ret > 0)) ret = net_xmit_errno(ret); } dev_put(dev); return ret; } static void __netlink_deliver_tap(struct sk_buff *skb) { int ret; struct netlink_tap *tmp; if (!netlink_filter_tap(skb)) return; list_for_each_entry_rcu(tmp, &netlink_tap_all, list) { ret = __netlink_deliver_tap_skb(skb, tmp->dev); if (unlikely(ret)) break; } } static void netlink_deliver_tap(struct sk_buff *skb) { rcu_read_lock(); if (unlikely(!list_empty(&netlink_tap_all))) __netlink_deliver_tap(skb); rcu_read_unlock(); } static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src, struct sk_buff *skb) { if (!(netlink_is_kernel(dst) && netlink_is_kernel(src))) netlink_deliver_tap(skb); } static void netlink_overrun(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (!(nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)) { if (!test_and_set_bit(NETLINK_S_CONGESTED, &nlk_sk(sk)->state)) { sk->sk_err = ENOBUFS; sk->sk_error_report(sk); } } atomic_inc(&sk->sk_drops); } static void netlink_rcv_wake(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (skb_queue_empty(&sk->sk_receive_queue)) clear_bit(NETLINK_S_CONGESTED, &nlk->state); if (!test_bit(NETLINK_S_CONGESTED, &nlk->state)) wake_up_interruptible(&nlk->wait); } static void netlink_skb_destructor(struct sk_buff *skb) { if (is_vmalloc_addr(skb->head)) { if (!skb->cloned || !atomic_dec_return(&(skb_shinfo(skb)->dataref))) vfree(skb->head); skb->head = NULL; } if (skb->sk != NULL) sock_rfree(skb); } static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { WARN_ON(skb->sk != NULL); skb->sk = sk; skb->destructor = netlink_skb_destructor; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static void netlink_sock_destruct(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->cb_running) { if (nlk->cb.done) nlk->cb.done(&nlk->cb); module_put(nlk->cb.module); kfree_skb(nlk->cb.skb); } skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); WARN_ON(nlk_sk(sk)->groups); } static void netlink_sock_destruct_work(struct work_struct *work) { struct netlink_sock *nlk = container_of(work, struct netlink_sock, work); sk_free(&nlk->sk); } /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves * this, _but_ remember, it adds useless work on UP machines. */ void netlink_table_grab(void) __acquires(nl_table_lock) { might_sleep(); write_lock_irq(&nl_table_lock); if (atomic_read(&nl_table_users)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(&nl_table_wait, &wait); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&nl_table_users) == 0) break; write_unlock_irq(&nl_table_lock); schedule(); write_lock_irq(&nl_table_lock); } __set_current_state(TASK_RUNNING); remove_wait_queue(&nl_table_wait, &wait); } } void netlink_table_ungrab(void) __releases(nl_table_lock) { write_unlock_irq(&nl_table_lock); wake_up(&nl_table_wait); } static inline void netlink_lock_table(void) { unsigned long flags; /* read_lock() synchronizes us to netlink_table_grab */ read_lock_irqsave(&nl_table_lock, flags); atomic_inc(&nl_table_users); read_unlock_irqrestore(&nl_table_lock, flags); } static inline void netlink_unlock_table(void) { if (atomic_dec_and_test(&nl_table_users)) wake_up(&nl_table_wait); } struct netlink_compare_arg { possible_net_t pnet; u32 portid; }; /* Doing sizeof directly may yield 4 extra bytes on 64-bit. */ #define netlink_compare_arg_len \ (offsetof(struct netlink_compare_arg, portid) + sizeof(u32)) static inline int netlink_compare(struct rhashtable_compare_arg *arg, const void *ptr) { const struct netlink_compare_arg *x = arg->key; const struct netlink_sock *nlk = ptr; return nlk->portid != x->portid || !net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet)); } static void netlink_compare_arg_init(struct netlink_compare_arg *arg, struct net *net, u32 portid) { memset(arg, 0, sizeof(*arg)); write_pnet(&arg->pnet, net); arg->portid = portid; } static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid, struct net *net) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, net, portid); return rhashtable_lookup_fast(&table->hash, &arg, netlink_rhashtable_params); } static int __netlink_insert(struct netlink_table *table, struct sock *sk) { struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid); return rhashtable_lookup_insert_key(&table->hash, &arg, &nlk_sk(sk)->node, netlink_rhashtable_params); } static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid) { struct netlink_table *table = &nl_table[protocol]; struct sock *sk; rcu_read_lock(); sk = __netlink_lookup(table, portid, net); if (sk) sock_hold(sk); rcu_read_unlock(); return sk; } static const struct proto_ops netlink_ops; static void netlink_update_listeners(struct sock *sk) { struct netlink_table *tbl = &nl_table[sk->sk_protocol]; unsigned long mask; unsigned int i; struct listeners *listeners; listeners = nl_deref_protected(tbl->listeners); if (!listeners) return; for (i = 0; i < NLGRPLONGS(tbl->groups); i++) { mask = 0; sk_for_each_bound(sk, &tbl->mc_list) { if (i < NLGRPLONGS(nlk_sk(sk)->ngroups)) mask |= nlk_sk(sk)->groups[i]; } listeners->masks[i] = mask; } /* this function is only called with the netlink table "grabbed", which * makes sure updates are visible before bind or setsockopt return. */ } static int netlink_insert(struct sock *sk, u32 portid) { struct netlink_table *table = &nl_table[sk->sk_protocol]; int err; lock_sock(sk); err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY; if (nlk_sk(sk)->bound) goto err; err = -ENOMEM; if (BITS_PER_LONG > 32 && unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX)) goto err; nlk_sk(sk)->portid = portid; sock_hold(sk); err = __netlink_insert(table, sk); if (err) { /* In case the hashtable backend returns with -EBUSY * from here, it must not escape to the caller. */ if (unlikely(err == -EBUSY)) err = -EOVERFLOW; if (err == -EEXIST) err = -EADDRINUSE; sock_put(sk); goto err; } /* We need to ensure that the socket is hashed and visible. */ smp_wmb(); /* Paired with lockless reads from netlink_bind(), * netlink_connect() and netlink_sendmsg(). */ WRITE_ONCE(nlk_sk(sk)->bound, portid); err: release_sock(sk); return err; } static void netlink_remove(struct sock *sk) { struct netlink_table *table; table = &nl_table[sk->sk_protocol]; if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, netlink_rhashtable_params)) { WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } netlink_table_grab(); if (nlk_sk(sk)->subscriptions) { __sk_del_bind_node(sk); netlink_update_listeners(sk); } if (sk->sk_protocol == NETLINK_GENERIC) atomic_inc(&genl_sk_destructing_cnt); netlink_table_ungrab(); } static struct proto netlink_proto = { .name = "NETLINK", .owner = THIS_MODULE, .obj_size = sizeof(struct netlink_sock), }; static int __netlink_create(struct net *net, struct socket *sock, struct mutex *cb_mutex, int protocol, int kern) { struct sock *sk; struct netlink_sock *nlk; sock->ops = &netlink_ops; sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern); if (!sk) return -ENOMEM; sock_init_data(sock, sk); nlk = nlk_sk(sk); if (cb_mutex) { nlk->cb_mutex = cb_mutex; } else { nlk->cb_mutex = &nlk->cb_def_mutex; mutex_init(nlk->cb_mutex); lockdep_set_class_and_name(nlk->cb_mutex, nlk_cb_mutex_keys + protocol, nlk_cb_mutex_key_strings[protocol]); } init_waitqueue_head(&nlk->wait); sk->sk_destruct = netlink_sock_destruct; sk->sk_protocol = protocol; return 0; } static int netlink_create(struct net *net, struct socket *sock, int protocol, int kern) { struct module *module = NULL; struct mutex *cb_mutex; struct netlink_sock *nlk; int (*bind)(struct net *net, int group); void (*unbind)(struct net *net, int group); int err = 0; sock->state = SS_UNCONNECTED; if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) return -ESOCKTNOSUPPORT; if (protocol < 0 || protocol >= MAX_LINKS) return -EPROTONOSUPPORT; protocol = array_index_nospec(protocol, MAX_LINKS); netlink_lock_table(); #ifdef CONFIG_MODULES if (!nl_table[protocol].registered) { netlink_unlock_table(); request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol); netlink_lock_table(); } #endif if (nl_table[protocol].registered && try_module_get(nl_table[protocol].module)) module = nl_table[protocol].module; else err = -EPROTONOSUPPORT; cb_mutex = nl_table[protocol].cb_mutex; bind = nl_table[protocol].bind; unbind = nl_table[protocol].unbind; netlink_unlock_table(); if (err < 0) goto out; err = __netlink_create(net, sock, cb_mutex, protocol, kern); if (err < 0) goto out_module; local_bh_disable(); sock_prot_inuse_add(net, &netlink_proto, 1); local_bh_enable(); nlk = nlk_sk(sock->sk); nlk->module = module; nlk->netlink_bind = bind; nlk->netlink_unbind = unbind; out: return err; out_module: module_put(module); goto out; } static void deferred_put_nlk_sk(struct rcu_head *head) { struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu); struct sock *sk = &nlk->sk; kfree(nlk->groups); nlk->groups = NULL; if (!refcount_dec_and_test(&sk->sk_refcnt)) return; if (nlk->cb_running && nlk->cb.done) { INIT_WORK(&nlk->work, netlink_sock_destruct_work); schedule_work(&nlk->work); return; } sk_free(sk); } static int netlink_release(struct socket *sock) { struct sock *sk = sock->sk; struct netlink_sock *nlk; if (!sk) return 0; netlink_remove(sk); sock_orphan(sk); nlk = nlk_sk(sk); /* * OK. Socket is unlinked, any packets that arrive now * will be purged. */ /* must not acquire netlink_table_lock in any way again before unbind * and notifying genetlink is done as otherwise it might deadlock */ if (nlk->netlink_unbind) { int i; for (i = 0; i < nlk->ngroups; i++) if (test_bit(i, nlk->groups)) nlk->netlink_unbind(sock_net(sk), i + 1); } if (sk->sk_protocol == NETLINK_GENERIC && atomic_dec_return(&genl_sk_destructing_cnt) == 0) wake_up(&genl_sk_destructing_waitq); sock->sk = NULL; wake_up_interruptible_all(&nlk->wait); skb_queue_purge(&sk->sk_write_queue); if (nlk->portid && nlk->bound) { struct netlink_notify n = { .net = sock_net(sk), .protocol = sk->sk_protocol, .portid = nlk->portid, }; blocking_notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); } module_put(nlk->module); if (netlink_is_kernel(sk)) { netlink_table_grab(); BUG_ON(nl_table[sk->sk_protocol].registered == 0); if (--nl_table[sk->sk_protocol].registered == 0) { struct listeners *old; old = nl_deref_protected(nl_table[sk->sk_protocol].listeners); RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL); kfree_rcu(old, rcu); nl_table[sk->sk_protocol].module = NULL; nl_table[sk->sk_protocol].bind = NULL; nl_table[sk->sk_protocol].unbind = NULL; nl_table[sk->sk_protocol].flags = 0; nl_table[sk->sk_protocol].registered = 0; } netlink_table_ungrab(); } local_bh_disable(); sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); local_bh_enable(); call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } static int netlink_autobind(struct socket *sock) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_table *table = &nl_table[sk->sk_protocol]; s32 portid = task_tgid_vnr(current); int err; s32 rover = -4096; bool ok; retry: cond_resched(); rcu_read_lock(); ok = !__netlink_lookup(table, portid, net); rcu_read_unlock(); if (!ok) { /* Bind collision, search negative portid values. */ if (rover == -4096) /* rover will be in range [S32_MIN, -4097] */ rover = S32_MIN + prandom_u32_max(-4096 - S32_MIN); else if (rover >= -4096) rover = -4097; portid = rover--; goto retry; } err = netlink_insert(sk, portid); if (err == -EADDRINUSE) goto retry; /* If 2 threads race to autobind, that is fine. */ if (err == -EBUSY) err = 0; return err; } /** * __netlink_ns_capable - General netlink message capability test * @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace. * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has has the capability @cap in the user namespace @user_ns. */ bool __netlink_ns_capable(const struct netlink_skb_parms *nsp, struct user_namespace *user_ns, int cap) { return ((nsp->flags & NETLINK_SKB_DST) || file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) && ns_capable(user_ns, cap); } EXPORT_SYMBOL(__netlink_ns_capable); /** * netlink_ns_capable - General netlink message capability test * @skb: socket buffer holding a netlink command from userspace * @user_ns: The user namespace of the capability to use * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has has the capability @cap in the user namespace @user_ns. */ bool netlink_ns_capable(const struct sk_buff *skb, struct user_namespace *user_ns, int cap) { return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap); } EXPORT_SYMBOL(netlink_ns_capable); /** * netlink_capable - Netlink global message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has has the capability @cap in all user namespaces. */ bool netlink_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, &init_user_ns, cap); } EXPORT_SYMBOL(netlink_capable); /** * netlink_net_capable - Netlink network namespace message capability test * @skb: socket buffer holding a netlink command from userspace * @cap: The capability to use * * Test to see if the opener of the socket we received the message * from had when the netlink socket was created and the sender of the * message has has the capability @cap over the network namespace of * the socket we received the message from. */ bool netlink_net_capable(const struct sk_buff *skb, int cap) { return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap); } EXPORT_SYMBOL(netlink_net_capable); static inline int netlink_allowed(const struct socket *sock, unsigned int flag) { return (nl_table[sock->sk->sk_protocol].flags & flag) || ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN); } static void netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions) { struct netlink_sock *nlk = nlk_sk(sk); if (nlk->subscriptions && !subscriptions) __sk_del_bind_node(sk); else if (!nlk->subscriptions && subscriptions) sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); nlk->subscriptions = subscriptions; } static int netlink_realloc_groups(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); unsigned int groups; unsigned long *new_groups; int err = 0; netlink_table_grab(); groups = nl_table[sk->sk_protocol].groups; if (!nl_table[sk->sk_protocol].registered) { err = -ENOENT; goto out_unlock; } if (nlk->ngroups >= groups) goto out_unlock; new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC); if (new_groups == NULL) { err = -ENOMEM; goto out_unlock; } memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0, NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups)); nlk->groups = new_groups; nlk->ngroups = groups; out_unlock: netlink_table_ungrab(); return err; } static void netlink_undo_bind(int group, long unsigned int groups, struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); int undo; if (!nlk->netlink_unbind) return; for (undo = 0; undo < group; undo++) if (test_bit(undo, &groups)) nlk->netlink_unbind(sock_net(sk), undo + 1); } static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; int err = 0; long unsigned int groups = nladdr->nl_groups; bool bound; if (addr_len < sizeof(struct sockaddr_nl)) return -EINVAL; if (nladdr->nl_family != AF_NETLINK) return -EINVAL; /* Only superuser is allowed to listen multicasts */ if (groups) { if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; } if (nlk->ngroups == 0) groups = 0; else if (nlk->ngroups < 8*sizeof(groups)) groups &= (1UL << nlk->ngroups) - 1; /* Paired with WRITE_ONCE() in netlink_insert() */ bound = READ_ONCE(nlk->bound); if (bound) { /* Ensure nlk->portid is up-to-date. */ smp_rmb(); if (nladdr->nl_pid != nlk->portid) return -EINVAL; } netlink_lock_table(); if (nlk->netlink_bind && groups) { int group; /* nl_groups is a u32, so cap the maximum groups we can bind */ for (group = 0; group < BITS_PER_TYPE(u32); group++) { if (!test_bit(group, &groups)) continue; err = nlk->netlink_bind(net, group + 1); if (!err) continue; netlink_undo_bind(group, groups, sk); goto unlock; } } /* No need for barriers here as we return to user-space without * using any of the bound attributes. */ if (!bound) { err = nladdr->nl_pid ? netlink_insert(sk, nladdr->nl_pid) : netlink_autobind(sock); if (err) { netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk); goto unlock; } } if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0])) goto unlock; netlink_unlock_table(); netlink_table_grab(); netlink_update_subscriptions(sk, nlk->subscriptions + hweight32(groups) - hweight32(nlk->groups[0])); nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups; netlink_update_listeners(sk); netlink_table_ungrab(); return 0; unlock: netlink_unlock_table(); return err; } static int netlink_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { int err = 0; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; if (alen < sizeof(addr->sa_family)) return -EINVAL; if (addr->sa_family == AF_UNSPEC) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, 0); WRITE_ONCE(nlk->dst_group, 0); return 0; } if (addr->sa_family != AF_NETLINK) return -EINVAL; if (alen < sizeof(struct sockaddr_nl)) return -EINVAL; if ((nladdr->nl_groups || nladdr->nl_pid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) return -EPERM; /* No need for barriers here as we return to user-space without * using any of the bound attributes. * Paired with WRITE_ONCE() in netlink_insert(). */ if (!READ_ONCE(nlk->bound)) err = netlink_autobind(sock); if (err == 0) { /* paired with READ_ONCE() in netlink_getsockbyportid() */ WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED); /* dst_portid and dst_group can be read locklessly */ WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid); WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups)); } return err; } static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr); nladdr->nl_family = AF_NETLINK; nladdr->nl_pad = 0; *addr_len = sizeof(*nladdr); if (peer) { /* Paired with WRITE_ONCE() in netlink_connect() */ nladdr->nl_pid = READ_ONCE(nlk->dst_portid); nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group)); } else { nladdr->nl_pid = nlk->portid; netlink_lock_table(); nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0; netlink_unlock_table(); } return 0; } static int netlink_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { /* try to hand this ioctl down to the NIC drivers. */ return -ENOIOCTLCMD; } static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid) { struct sock *sock; struct netlink_sock *nlk; sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid); if (!sock) return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ nlk = nlk_sk(sock); /* dst_portid and sk_state can be changed in netlink_connect() */ if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED && READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) { sock_put(sock); return ERR_PTR(-ECONNREFUSED); } return sock; } struct sock *netlink_getsockbyfilp(struct file *filp) { struct inode *inode = file_inode(filp); struct sock *sock; if (!S_ISSOCK(inode->i_mode)) return ERR_PTR(-ENOTSOCK); sock = SOCKET_I(inode)->sk; if (sock->sk_family != AF_NETLINK) return ERR_PTR(-EINVAL); sock_hold(sock); return sock; } static struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast) { struct sk_buff *skb; void *data; if (size <= NLMSG_GOODSIZE || broadcast) return alloc_skb(size, GFP_KERNEL); size = SKB_DATA_ALIGN(size) + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); data = vmalloc(size); if (data == NULL) return NULL; skb = __build_skb(data, size); if (skb == NULL) vfree(data); else skb->destructor = netlink_skb_destructor; return skb; } /* * Attach a skb to a netlink socket. * The caller must hold a reference to the destination socket. On error, the * reference is dropped. The skb is not send to the destination, just all * all error checks are performed and memory in the queue is reserved. * Return values: * < 0: error. skb freed, reference to sock dropped. * 0: continue * 1: repeat lookup - reference dropped while waiting for socket memory. */ int netlink_attachskb(struct sock *sk, struct sk_buff *skb, long *timeo, struct sock *ssk) { struct netlink_sock *nlk; nlk = nlk_sk(sk); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state))) { DECLARE_WAITQUEUE(wait, current); if (!*timeo) { if (!ssk || netlink_is_kernel(ssk)) netlink_overrun(sk); sock_put(sk); kfree_skb(skb); return -EAGAIN; } __set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&nlk->wait, &wait); if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(NETLINK_S_CONGESTED, &nlk->state)) && !sock_flag(sk, SOCK_DEAD)) *timeo = schedule_timeout(*timeo); __set_current_state(TASK_RUNNING); remove_wait_queue(&nlk->wait, &wait); sock_put(sk); if (signal_pending(current)) { kfree_skb(skb); return sock_intr_errno(*timeo); } return 1; } netlink_skb_set_owner_r(skb, sk); return 0; } static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = skb->len; netlink_deliver_tap(skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); return len; } int netlink_sendskb(struct sock *sk, struct sk_buff *skb) { int len = __netlink_sendskb(sk, skb); sock_put(sk); return len; } void netlink_detachskb(struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); sock_put(sk); } static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation) { int delta; WARN_ON(skb->sk != NULL); delta = skb->end - skb->tail; if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) return skb; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, allocation); if (!nskb) return skb; consume_skb(skb); skb = nskb; } pskb_expand_head(skb, 0, -delta, (allocation & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); return skb; } static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb, struct sock *ssk) { int ret; struct netlink_sock *nlk = nlk_sk(sk); ret = -ECONNREFUSED; if (nlk->netlink_rcv != NULL) { ret = skb->len; netlink_skb_set_owner_r(skb, sk); NETLINK_CB(skb).sk = ssk; netlink_deliver_tap_kernel(sk, ssk, skb); nlk->netlink_rcv(skb); consume_skb(skb); } else { kfree_skb(skb); } sock_put(sk); return ret; } int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 portid, int nonblock) { struct sock *sk; int err; long timeo; skb = netlink_trim(skb, gfp_any()); timeo = sock_sndtimeo(ssk, nonblock); retry: sk = netlink_getsockbyportid(ssk, portid); if (IS_ERR(sk)) { kfree_skb(skb); return PTR_ERR(sk); } if (netlink_is_kernel(sk)) return netlink_unicast_kernel(sk, skb, ssk); if (sk_filter(sk, skb)) { err = skb->len; kfree_skb(skb); sock_put(sk); return err; } err = netlink_attachskb(sk, skb, &timeo, ssk); if (err == 1) goto retry; if (err) return err; return netlink_sendskb(sk, skb); } EXPORT_SYMBOL(netlink_unicast); int netlink_has_listeners(struct sock *sk, unsigned int group) { int res = 0; struct listeners *listeners; BUG_ON(!netlink_is_kernel(sk)); rcu_read_lock(); listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners); if (listeners && group - 1 < nl_table[sk->sk_protocol].groups) res = test_bit(group - 1, listeners->masks); rcu_read_unlock(); return res; } EXPORT_SYMBOL_GPL(netlink_has_listeners); static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) { struct netlink_sock *nlk = nlk_sk(sk); if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(NETLINK_S_CONGESTED, &nlk->state)) { netlink_skb_set_owner_r(skb, sk); __netlink_sendskb(sk, skb); return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1); } return -1; } struct netlink_broadcast_data { struct sock *exclude_sk; struct net *net; u32 portid; u32 group; int failure; int delivery_failure; int congested; int delivered; gfp_t allocation; struct sk_buff *skb, *skb2; int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data); void *tx_data; }; static void do_one_broadcast(struct sock *sk, struct netlink_broadcast_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int val; if (p->exclude_sk == sk) return; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) return; if (!net_eq(sock_net(sk), p->net)) { if (!(nlk->flags & NETLINK_F_LISTEN_ALL_NSID)) return; if (!peernet_has_id(sock_net(sk), p->net)) return; if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns, CAP_NET_BROADCAST)) return; } if (p->failure) { netlink_overrun(sk); return; } sock_hold(sk); if (p->skb2 == NULL) { if (skb_shared(p->skb)) { p->skb2 = skb_clone(p->skb, p->allocation); } else { p->skb2 = skb_get(p->skb); /* * skb ownership may have been set when * delivered to a previous socket. */ skb_orphan(p->skb2); } } if (p->skb2 == NULL) { netlink_overrun(sk); /* Clone failed. Notify ALL listeners. */ p->failure = 1; if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; goto out; } if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } if (sk_filter(sk, p->skb2)) { kfree_skb(p->skb2); p->skb2 = NULL; goto out; } NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net); if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED) NETLINK_CB(p->skb2).nsid_is_set = true; val = netlink_broadcast_deliver(sk, p->skb2); if (val < 0) { netlink_overrun(sk); if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) p->delivery_failure = 1; } else { p->congested |= val; p->delivered = 1; p->skb2 = NULL; } out: sock_put(sk); } int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation, int (*filter)(struct sock *dsk, struct sk_buff *skb, void *data), void *filter_data) { struct net *net = sock_net(ssk); struct netlink_broadcast_data info; struct sock *sk; skb = netlink_trim(skb, allocation); info.exclude_sk = ssk; info.net = net; info.portid = portid; info.group = group; info.failure = 0; info.delivery_failure = 0; info.congested = 0; info.delivered = 0; info.allocation = allocation; info.skb = skb; info.skb2 = NULL; info.tx_filter = filter; info.tx_data = filter_data; /* While we sleep in clone, do not allow to change socket list */ netlink_lock_table(); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) do_one_broadcast(sk, &info); consume_skb(skb); netlink_unlock_table(); if (info.delivery_failure) { kfree_skb(info.skb2); return -ENOBUFS; } consume_skb(info.skb2); if (info.delivered) { if (info.congested && gfpflags_allow_blocking(allocation)) yield(); return 0; } return -ESRCH; } EXPORT_SYMBOL(netlink_broadcast_filtered); int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid, u32 group, gfp_t allocation) { return netlink_broadcast_filtered(ssk, skb, portid, group, allocation, NULL, NULL); } EXPORT_SYMBOL(netlink_broadcast); struct netlink_set_err_data { struct sock *exclude_sk; u32 portid; u32 group; int code; }; static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p) { struct netlink_sock *nlk = nlk_sk(sk); int ret = 0; if (sk == p->exclude_sk) goto out; if (!net_eq(sock_net(sk), sock_net(p->exclude_sk))) goto out; if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups || !test_bit(p->group - 1, nlk->groups)) goto out; if (p->code == ENOBUFS && nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) { ret = 1; goto out; } sk->sk_err = p->code; sk->sk_error_report(sk); out: return ret; } /** * netlink_set_err - report error to broadcast listeners * @ssk: the kernel netlink socket, as returned by netlink_kernel_create() * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_NO_ENOBUFS socket option. */ int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code) { struct netlink_set_err_data info; struct sock *sk; int ret = 0; info.exclude_sk = ssk; info.portid = portid; info.group = group; /* sk->sk_err wants a positive error value */ info.code = -code; read_lock(&nl_table_lock); sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list) ret += do_one_set_err(sk, &info); read_unlock(&nl_table_lock); return ret; } EXPORT_SYMBOL(netlink_set_err); /* must be called with netlink table grabbed */ static void netlink_update_socket_mc(struct netlink_sock *nlk, unsigned int group, int is_new) { int old, new = !!is_new, subscriptions; old = test_bit(group - 1, nlk->groups); subscriptions = nlk->subscriptions - old + new; if (new) __set_bit(group - 1, nlk->groups); else __clear_bit(group - 1, nlk->groups); netlink_update_subscriptions(&nlk->sk, subscriptions); netlink_update_listeners(&nlk->sk); } static int netlink_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); unsigned int val = 0; int err; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (optlen >= sizeof(int) && get_user(val, (unsigned int __user *)optval)) return -EFAULT; switch (optname) { case NETLINK_PKTINFO: if (val) nlk->flags |= NETLINK_F_RECV_PKTINFO; else nlk->flags &= ~NETLINK_F_RECV_PKTINFO; err = 0; break; case NETLINK_ADD_MEMBERSHIP: case NETLINK_DROP_MEMBERSHIP: { if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV)) return -EPERM; err = netlink_realloc_groups(sk); if (err) return err; if (!val || val - 1 >= nlk->ngroups) return -EINVAL; if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) { err = nlk->netlink_bind(sock_net(sk), val); if (err) return err; } netlink_table_grab(); netlink_update_socket_mc(nlk, val, optname == NETLINK_ADD_MEMBERSHIP); netlink_table_ungrab(); if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind) nlk->netlink_unbind(sock_net(sk), val); err = 0; break; } case NETLINK_BROADCAST_ERROR: if (val) nlk->flags |= NETLINK_F_BROADCAST_SEND_ERROR; else nlk->flags &= ~NETLINK_F_BROADCAST_SEND_ERROR; err = 0; break; case NETLINK_NO_ENOBUFS: if (val) { nlk->flags |= NETLINK_F_RECV_NO_ENOBUFS; clear_bit(NETLINK_S_CONGESTED, &nlk->state); wake_up_interruptible(&nlk->wait); } else { nlk->flags &= ~NETLINK_F_RECV_NO_ENOBUFS; } err = 0; break; case NETLINK_LISTEN_ALL_NSID: if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST)) return -EPERM; if (val) nlk->flags |= NETLINK_F_LISTEN_ALL_NSID; else nlk->flags &= ~NETLINK_F_LISTEN_ALL_NSID; err = 0; break; case NETLINK_CAP_ACK: if (val) nlk->flags |= NETLINK_F_CAP_ACK; else nlk->flags &= ~NETLINK_F_CAP_ACK; err = 0; break; case NETLINK_EXT_ACK: if (val) nlk->flags |= NETLINK_F_EXT_ACK; else nlk->flags &= ~NETLINK_F_EXT_ACK; err = 0; break; default: err = -ENOPROTOOPT; } return err; } static int netlink_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); int len, val, err; if (level != SOL_NETLINK) return -ENOPROTOOPT; if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; switch (optname) { case NETLINK_PKTINFO: if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = nlk->flags & NETLINK_F_RECV_PKTINFO ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; err = 0; break; case NETLINK_BROADCAST_ERROR: if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; err = 0; break; case NETLINK_NO_ENOBUFS: if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = nlk->flags & NETLINK_F_RECV_NO_ENOBUFS ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; err = 0; break; case NETLINK_LIST_MEMBERSHIPS: { int pos, idx, shift; err = 0; netlink_lock_table(); for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) { if (len - pos < sizeof(u32)) break; idx = pos / sizeof(unsigned long); shift = (pos % sizeof(unsigned long)) * 8; if (put_user((u32)(nlk->groups[idx] >> shift), (u32 __user *)(optval + pos))) { err = -EFAULT; break; } } if (put_user(ALIGN(nlk->ngroups / 8, sizeof(u32)), optlen)) err = -EFAULT; netlink_unlock_table(); break; } case NETLINK_CAP_ACK: if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = nlk->flags & NETLINK_F_CAP_ACK ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; err = 0; break; case NETLINK_EXT_ACK: if (len < sizeof(int)) return -EINVAL; len = sizeof(int); val = nlk->flags & NETLINK_F_EXT_ACK ? 1 : 0; if (put_user(len, optlen) || put_user(val, optval)) return -EFAULT; err = 0; break; default: err = -ENOPROTOOPT; } return err; } static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) { struct nl_pktinfo info; info.group = NETLINK_CB(skb).dst_group; put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info); } static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) { if (!NETLINK_CB(skb).nsid_is_set) return; put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int), &NETLINK_CB(skb).nsid); } static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); u32 dst_portid; u32 dst_group; struct sk_buff *skb; int err; struct scm_cookie scm; u32 netlink_skb_flags = 0; if (msg->msg_flags&MSG_OOB) return -EOPNOTSUPP; if (len == 0) { pr_warn_once("Zero length message leads to an empty skb\n"); return -ENODATA; } err = scm_send(sock, msg, &scm, true); if (err < 0) return err; if (msg->msg_namelen) { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_nl)) goto out; if (addr->nl_family != AF_NETLINK) goto out; dst_portid = addr->nl_pid; dst_group = ffs(addr->nl_groups); err = -EPERM; if ((dst_group || dst_portid) && !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND)) goto out; netlink_skb_flags |= NETLINK_SKB_DST; } else { /* Paired with WRITE_ONCE() in netlink_connect() */ dst_portid = READ_ONCE(nlk->dst_portid); dst_group = READ_ONCE(nlk->dst_group); } /* Paired with WRITE_ONCE() in netlink_insert() */ if (!READ_ONCE(nlk->bound)) { err = netlink_autobind(sock); if (err) goto out; } else { /* Ensure nlk is hashed and visible. */ smp_rmb(); } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = netlink_alloc_large_skb(len, dst_group); if (skb == NULL) goto out; NETLINK_CB(skb).portid = nlk->portid; NETLINK_CB(skb).dst_group = dst_group; NETLINK_CB(skb).creds = scm.creds; NETLINK_CB(skb).flags = netlink_skb_flags; err = -EFAULT; if (memcpy_from_msg(skb_put(skb, len), msg, len)) { kfree_skb(skb); goto out; } err = security_netlink_send(sk, skb); if (err) { kfree_skb(skb); goto out; } if (dst_group) { refcount_inc(&skb->users); netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL); } err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags&MSG_DONTWAIT); out: scm_destroy(&scm); return err; } static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); int noblock = flags&MSG_DONTWAIT; size_t copied; struct sk_buff *skb, *data_skb; int err, ret; if (flags&MSG_OOB) return -EOPNOTSUPP; copied = 0; skb = skb_recv_datagram(sk, flags, noblock, &err); if (skb == NULL) goto out; data_skb = skb; #ifdef CONFIG_COMPAT_NETLINK_MESSAGES if (unlikely(skb_shinfo(skb)->frag_list)) { /* * If this skb has a frag_list, then here that means that we * will have to use the frag_list skb's data for compat tasks * and the regular skb's data for normal (non-compat) tasks. * * If we need to send the compat skb, assign it to the * 'data_skb' variable so that it will be used below for data * copying. We keep 'skb' for everything else, including * freeing both later. */ if (flags & MSG_CMSG_COMPAT) data_skb = skb_shinfo(skb)->frag_list; } #endif /* Record the max length of recvmsg() calls for future allocations */ nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, SKB_WITH_OVERHEAD(32768)); copied = data_skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); addr->nl_family = AF_NETLINK; addr->nl_pad = 0; addr->nl_pid = NETLINK_CB(skb).portid; addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group); msg->msg_namelen = sizeof(*addr); } if (nlk->flags & NETLINK_F_RECV_PKTINFO) netlink_cmsg_recv_pktinfo(msg, skb); if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) netlink_cmsg_listen_all_nsid(sk, msg, skb); memset(&scm, 0, sizeof(scm)); scm.creds = *NETLINK_CREDS(skb); if (flags & MSG_TRUNC) copied = data_skb->len; skb_free_datagram(sk, skb); if (nlk->cb_running && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { ret = netlink_dump(sk); if (ret) { sk->sk_err = -ret; sk->sk_error_report(sk); } } scm_recv(sock, msg, &scm, flags); out: netlink_rcv_wake(sk); return err ? : copied; } static void netlink_data_ready(struct sock *sk) { BUG(); } /* * We export these functions to other modules. They provide a * complete set of kernel non-blocking support for message * queueing. */ struct sock * __netlink_kernel_create(struct net *net, int unit, struct module *module, struct netlink_kernel_cfg *cfg) { struct socket *sock; struct sock *sk; struct netlink_sock *nlk; struct listeners *listeners = NULL; struct mutex *cb_mutex = cfg ? cfg->cb_mutex : NULL; unsigned int groups; BUG_ON(!nl_table); if (unit < 0 || unit >= MAX_LINKS) return NULL; if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) return NULL; if (__netlink_create(net, sock, cb_mutex, unit, 1) < 0) goto out_sock_release_nosk; sk = sock->sk; if (!cfg || cfg->groups < 32) groups = 32; else groups = cfg->groups; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) goto out_sock_release; sk->sk_data_ready = netlink_data_ready; if (cfg && cfg->input) nlk_sk(sk)->netlink_rcv = cfg->input; if (netlink_insert(sk, 0)) goto out_sock_release; nlk = nlk_sk(sk); nlk->flags |= NETLINK_F_KERNEL_SOCKET; netlink_table_grab(); if (!nl_table[unit].registered) { nl_table[unit].groups = groups; rcu_assign_pointer(nl_table[unit].listeners, listeners); nl_table[unit].cb_mutex = cb_mutex; nl_table[unit].module = module; if (cfg) { nl_table[unit].bind = cfg->bind; nl_table[unit].unbind = cfg->unbind; nl_table[unit].flags = cfg->flags; if (cfg->compare) nl_table[unit].compare = cfg->compare; } nl_table[unit].registered = 1; } else { kfree(listeners); nl_table[unit].registered++; } netlink_table_ungrab(); return sk; out_sock_release: kfree(listeners); netlink_kernel_release(sk); return NULL; out_sock_release_nosk: sock_release(sock); return NULL; } EXPORT_SYMBOL(__netlink_kernel_create); void netlink_kernel_release(struct sock *sk) { if (sk == NULL || sk->sk_socket == NULL) return; sock_release(sk->sk_socket); } EXPORT_SYMBOL(netlink_kernel_release); int __netlink_change_ngroups(struct sock *sk, unsigned int groups) { struct listeners *new, *old; struct netlink_table *tbl = &nl_table[sk->sk_protocol]; if (groups < 32) groups = 32; if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) { new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC); if (!new) return -ENOMEM; old = nl_deref_protected(tbl->listeners); memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups)); rcu_assign_pointer(tbl->listeners, new); kfree_rcu(old, rcu); } tbl->groups = groups; return 0; } /** * netlink_change_ngroups - change number of multicast groups * * This changes the number of multicast groups that are available * on a certain netlink family. Note that it is not possible to * change the number of groups to below 32. Also note that it does * not implicitly call netlink_clear_multicast_users() when the * number of groups is reduced. * * @sk: The kernel netlink socket, as returned by netlink_kernel_create(). * @groups: The new number of groups. */ int netlink_change_ngroups(struct sock *sk, unsigned int groups) { int err; netlink_table_grab(); err = __netlink_change_ngroups(sk, groups); netlink_table_ungrab(); return err; } void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group) { struct sock *sk; struct netlink_table *tbl = &nl_table[ksk->sk_protocol]; sk_for_each_bound(sk, &tbl->mc_list) netlink_update_socket_mc(nlk_sk(sk), group, 0); } struct nlmsghdr * __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags) { struct nlmsghdr *nlh; int size = nlmsg_msg_size(len); nlh = skb_put(skb, NLMSG_ALIGN(size)); nlh->nlmsg_type = type; nlh->nlmsg_len = size; nlh->nlmsg_flags = flags; nlh->nlmsg_pid = portid; nlh->nlmsg_seq = seq; if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0) memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size); return nlh; } EXPORT_SYMBOL(__nlmsg_put); /* * It looks a bit ugly. * It would be better to create kernel thread. */ static int netlink_dump(struct sock *sk) { struct netlink_sock *nlk = nlk_sk(sk); struct netlink_callback *cb; struct sk_buff *skb = NULL; struct nlmsghdr *nlh; struct module *module; int err = -ENOBUFS; int alloc_min_size; int alloc_size; mutex_lock(nlk->cb_mutex); if (!nlk->cb_running) { err = -EINVAL; goto errout_skb; } if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto errout_skb; /* NLMSG_GOODSIZE is small to avoid high order allocations being * required, but it makes sense to _attempt_ a 16K bytes allocation * to reduce number of system calls on dump operations, if user * ever provided a big enough buffer. */ cb = &nlk->cb; alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); if (alloc_min_size < nlk->max_recvmsg_len) { alloc_size = nlk->max_recvmsg_len; skb = alloc_skb(alloc_size, (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); } if (!skb) { alloc_size = alloc_min_size; skb = alloc_skb(alloc_size, GFP_KERNEL); } if (!skb) goto errout_skb; /* Trim skb to allocated size. User is expected to provide buffer as * large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at * netlink_recvmsg())). dump will pack as many smaller messages as * could fit within the allocated skb. skb is typically allocated * with larger space than required (could be as much as near 2x the * requested size with align to next power of 2 approach). Allowing * dump to use the excess space makes it difficult for a user to have a * reasonable static buffer based on the expected largest dump of a * single netdev. The outcome is MSG_TRUNC error. */ skb_reserve(skb, skb_tailroom(skb) - alloc_size); /* Make sure malicious BPF programs can not read unitialized memory * from skb->head -> skb->data */ skb_reset_network_header(skb); skb_reset_mac_header(skb); netlink_skb_set_owner_r(skb, sk); if (nlk->dump_done_errno > 0) nlk->dump_done_errno = cb->dump(skb, cb); if (nlk->dump_done_errno > 0 || skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) { mutex_unlock(nlk->cb_mutex); if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); return 0; } nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno), NLM_F_MULTI); if (WARN_ON(!nlh)) goto errout_skb; nl_dump_check_consistent(cb, nlh); memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno)); if (sk_filter(sk, skb)) kfree_skb(skb); else __netlink_sendskb(sk, skb); if (cb->done) cb->done(cb); nlk->cb_running = false; module = cb->module; skb = cb->skb; mutex_unlock(nlk->cb_mutex); module_put(module); consume_skb(skb); return 0; errout_skb: mutex_unlock(nlk->cb_mutex); kfree_skb(skb); return err; } int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { struct netlink_callback *cb; struct sock *sk; struct netlink_sock *nlk; int ret; refcount_inc(&skb->users); sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); if (sk == NULL) { ret = -ECONNREFUSED; goto error_free; } nlk = nlk_sk(sk); mutex_lock(nlk->cb_mutex); /* A dump is in progress... */ if (nlk->cb_running) { ret = -EBUSY; goto error_unlock; } /* add reference of module which cb->dump belongs to */ if (!try_module_get(control->module)) { ret = -EPROTONOSUPPORT; goto error_unlock; } cb = &nlk->cb; memset(cb, 0, sizeof(*cb)); cb->start = control->start; cb->dump = control->dump; cb->done = control->done; cb->nlh = nlh; cb->data = control->data; cb->module = control->module; cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; if (cb->start) { ret = cb->start(cb); if (ret) goto error_put; } nlk->cb_running = true; nlk->dump_done_errno = INT_MAX; mutex_unlock(nlk->cb_mutex); ret = netlink_dump(sk); sock_put(sk); if (ret) return ret; /* We successfully started a dump, by returning -EINTR we * signal not to send ACK even if it was requested. */ return -EINTR; error_put: module_put(control->module); error_unlock: sock_put(sk); mutex_unlock(nlk->cb_mutex); error_free: kfree_skb(skb); return ret; } EXPORT_SYMBOL(__netlink_dump_start); void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); size_t tlvlen = 0; struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); unsigned int flags = 0; bool nlk_has_extack = nlk->flags & NETLINK_F_EXT_ACK; /* Error messages get the original request appened, unless the user * requests to cap the error message, and get extra error data if * requested. */ if (err) { if (!(nlk->flags & NETLINK_F_CAP_ACK)) payload += nlmsg_len(nlh); else flags |= NLM_F_CAPPED; if (nlk_has_extack && extack) { if (extack->_msg) tlvlen += nla_total_size(strlen(extack->_msg) + 1); if (extack->bad_attr) tlvlen += nla_total_size(sizeof(u32)); } } else { flags |= NLM_F_CAPPED; if (nlk_has_extack && extack && extack->cookie_len) tlvlen += nla_total_size(extack->cookie_len); } if (tlvlen) flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) { struct sock *sk; sk = netlink_lookup(sock_net(in_skb->sk), in_skb->sk->sk_protocol, NETLINK_CB(in_skb).portid); if (sk) { sk->sk_err = ENOBUFS; sk->sk_error_report(sk); sock_put(sk); } return; } rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, NLMSG_ERROR, payload, flags); errmsg = nlmsg_data(rep); errmsg->error = err; memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); if (nlk_has_extack && extack) { if (err) { if (extack->_msg) WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg)); if (extack->bad_attr && !WARN_ON((u8 *)extack->bad_attr < in_skb->data || (u8 *)extack->bad_attr >= in_skb->data + in_skb->len)) WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, (u8 *)extack->bad_attr - (u8 *)nlh)); } else { if (extack->cookie_len) WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, extack->cookie_len, extack->cookie)); } } nlmsg_end(skb, rep); netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)) { struct netlink_ext_ack extack; struct nlmsghdr *nlh; int err; while (skb->len >= nlmsg_total_size(0)) { int msglen; memset(&extack, 0, sizeof(extack)); nlh = nlmsg_hdr(skb); err = 0; if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len) return 0; /* Only requests are handled by the kernel */ if (!(nlh->nlmsg_flags & NLM_F_REQUEST)) goto ack; /* Skip control messages */ if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) msglen = skb->len; skb_pull(skb, msglen); } return 0; } EXPORT_SYMBOL(netlink_rcv_skb); /** * nlmsg_notify - send a notification netlink message * @sk: netlink socket to use * @skb: notification message * @portid: destination netlink portid for reports or 0 * @group: destination multicast group or 0 * @report: 1 to report back, 0 to disable * @flags: allocation flags */ int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags) { int err = 0; if (group) { int exclude_portid = 0; if (report) { refcount_inc(&skb->users); exclude_portid = portid; } /* errors reported via destination sk->sk_err, but propagate * delivery errors if NETLINK_BROADCAST_ERROR flag is set */ err = nlmsg_multicast(sk, skb, exclude_portid, group, flags); if (err == -ESRCH) err = 0; } if (report) { int err2; err2 = nlmsg_unicast(sk, skb, portid); if (!err) err = err2; } return err; } EXPORT_SYMBOL(nlmsg_notify); #ifdef CONFIG_PROC_FS struct nl_seq_iter { struct seq_net_private p; struct rhashtable_iter hti; int link; }; static int netlink_walk_start(struct nl_seq_iter *iter) { int err; err = rhashtable_walk_init(&nl_table[iter->link].hash, &iter->hti, GFP_KERNEL); if (err) { iter->link = MAX_LINKS; return err; } err = rhashtable_walk_start(&iter->hti); return err == -EAGAIN ? 0 : err; } static void netlink_walk_stop(struct nl_seq_iter *iter) { rhashtable_walk_stop(&iter->hti); rhashtable_walk_exit(&iter->hti); } static void *__netlink_seq_next(struct seq_file *seq) { struct nl_seq_iter *iter = seq->private; struct netlink_sock *nlk; do { for (;;) { int err; nlk = rhashtable_walk_next(&iter->hti); if (IS_ERR(nlk)) { if (PTR_ERR(nlk) == -EAGAIN) continue; return nlk; } if (nlk) break; netlink_walk_stop(iter); if (++iter->link >= MAX_LINKS) return NULL; err = netlink_walk_start(iter); if (err) return ERR_PTR(err); } } while (sock_net(&nlk->sk) != seq_file_net(seq)); return nlk; } static void *netlink_seq_start(struct seq_file *seq, loff_t *posp) { struct nl_seq_iter *iter = seq->private; void *obj = SEQ_START_TOKEN; loff_t pos; int err; iter->link = 0; err = netlink_walk_start(iter); if (err) return ERR_PTR(err); for (pos = *posp; pos && obj && !IS_ERR(obj); pos--) obj = __netlink_seq_next(seq); return obj; } static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; return __netlink_seq_next(seq); } static void netlink_seq_stop(struct seq_file *seq, void *v) { struct nl_seq_iter *iter = seq->private; if (iter->link >= MAX_LINKS) return; netlink_walk_stop(iter); } static int netlink_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "sk Eth Pid Groups " "Rmem Wmem Dump Locks Drops Inode\n"); } else { struct sock *s = v; struct netlink_sock *nlk = nlk_sk(s); seq_printf(seq, "%pK %-3d %-6u %08x %-8d %-8d %d %-8d %-8d %-8lu\n", s, s->sk_protocol, nlk->portid, nlk->groups ? (u32)nlk->groups[0] : 0, sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), nlk->cb_running, refcount_read(&s->sk_refcnt), atomic_read(&s->sk_drops), sock_i_ino(s) ); } return 0; } static const struct seq_operations netlink_seq_ops = { .start = netlink_seq_start, .next = netlink_seq_next, .stop = netlink_seq_stop, .show = netlink_seq_show, }; static int netlink_seq_open(struct inode *inode, struct file *file) { return seq_open_net(inode, file, &netlink_seq_ops, sizeof(struct nl_seq_iter)); } static const struct file_operations netlink_seq_fops = { .owner = THIS_MODULE, .open = netlink_seq_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release_net, }; #endif int netlink_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_register_notifier); int netlink_unregister_notifier(struct notifier_block *nb) { return blocking_notifier_chain_unregister(&netlink_chain, nb); } EXPORT_SYMBOL(netlink_unregister_notifier); static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, .bind = netlink_bind, .connect = netlink_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = netlink_getname, .poll = datagram_poll, .ioctl = netlink_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = netlink_setsockopt, .getsockopt = netlink_getsockopt, .sendmsg = netlink_sendmsg, .recvmsg = netlink_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, }; static const struct net_proto_family netlink_family_ops = { .family = PF_NETLINK, .create = netlink_create, .owner = THIS_MODULE, /* for consistency 8) */ }; static int __net_init netlink_net_init(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create("netlink", 0, net->proc_net, &netlink_seq_fops)) return -ENOMEM; #endif return 0; } static void __net_exit netlink_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("netlink", net->proc_net); #endif } static void __init netlink_add_usersock_entry(void) { struct listeners *listeners; int groups = 32; listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL); if (!listeners) panic("netlink_add_usersock_entry: Cannot allocate listeners\n"); netlink_table_grab(); nl_table[NETLINK_USERSOCK].groups = groups; rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners); nl_table[NETLINK_USERSOCK].module = THIS_MODULE; nl_table[NETLINK_USERSOCK].registered = 1; nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND; netlink_table_ungrab(); } static struct pernet_operations __net_initdata netlink_net_ops = { .init = netlink_net_init, .exit = netlink_net_exit, }; static inline u32 netlink_hash(const void *data, u32 len, u32 seed) { const struct netlink_sock *nlk = data; struct netlink_compare_arg arg; netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid); return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed); } static const struct rhashtable_params netlink_rhashtable_params = { .head_offset = offsetof(struct netlink_sock, node), .key_len = netlink_compare_arg_len, .obj_hashfn = netlink_hash, .obj_cmpfn = netlink_compare, .automatic_shrinking = true, }; static int __init netlink_proto_init(void) { int i; int err = proto_register(&netlink_proto, 0); if (err != 0) goto out; BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb)); nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL); if (!nl_table) goto panic; for (i = 0; i < MAX_LINKS; i++) { if (rhashtable_init(&nl_table[i].hash, &netlink_rhashtable_params) < 0) { while (--i > 0) rhashtable_destroy(&nl_table[i].hash); kfree(nl_table); goto panic; } } INIT_LIST_HEAD(&netlink_tap_all); netlink_add_usersock_entry(); sock_register(&netlink_family_ops); register_pernet_subsys(&netlink_net_ops); /* The netlink device handler may be needed early. */ rtnetlink_init(); out: return err; panic: panic("netlink_init: Cannot allocate nl_table\n"); } core_initcall(netlink_proto_init);
38 38 37 26 12 12 12 168 168 168 54 54 54 54 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 // SPDX-License-Identifier: GPL-2.0 #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <net/gro_cells.h> struct gro_cell { struct sk_buff_head napi_skbs; struct napi_struct napi; }; int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct gro_cell *cell; int res; rcu_read_lock(); if (unlikely(!(dev->flags & IFF_UP))) goto drop; if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) { res = netif_rx(skb); goto unlock; } cell = this_cpu_ptr(gcells->cells); if (skb_queue_len(&cell->napi_skbs) > netdev_max_backlog) { drop: atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); res = NET_RX_DROP; goto unlock; } __skb_queue_tail(&cell->napi_skbs, skb); if (skb_queue_len(&cell->napi_skbs) == 1) napi_schedule(&cell->napi); res = NET_RX_SUCCESS; unlock: rcu_read_unlock(); return res; } EXPORT_SYMBOL(gro_cells_receive); /* called under BH context */ static int gro_cell_poll(struct napi_struct *napi, int budget) { struct gro_cell *cell = container_of(napi, struct gro_cell, napi); struct sk_buff *skb; int work_done = 0; while (work_done < budget) { skb = __skb_dequeue(&cell->napi_skbs); if (!skb) break; napi_gro_receive(napi, skb); work_done++; } if (work_done < budget) napi_complete_done(napi, work_done); return work_done; } int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) { int i; gcells->cells = alloc_percpu(struct gro_cell); if (!gcells->cells) return -ENOMEM; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); __skb_queue_head_init(&cell->napi_skbs); set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); netif_napi_add(dev, &cell->napi, gro_cell_poll, NAPI_POLL_WEIGHT); napi_enable(&cell->napi); } return 0; } EXPORT_SYMBOL(gro_cells_init); void gro_cells_destroy(struct gro_cells *gcells) { int i; if (!gcells->cells) return; for_each_possible_cpu(i) { struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); napi_disable(&cell->napi); netif_napi_del(&cell->napi); __skb_queue_purge(&cell->napi_skbs); } free_percpu(gcells->cells); gcells->cells = NULL; } EXPORT_SYMBOL(gro_cells_destroy);
13 12 12 3 3 7 12 2 8 8 8 8 12 12 12 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/highmem.h> #include <linux/gfp.h> #include <linux/cpu.h> #include <linux/export.h> #include "rds.h" struct rds_page_remainder { struct page *r_page; unsigned long r_offset; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); /** * rds_page_remainder_alloc - build up regions of a message. * * @scat: Scatter list for message * @bytes: the number of bytes needed. * @gfp: the waiting behaviour of the allocation * * @gfp is always ored with __GFP_HIGHMEM. Callers must be prepared to * kmap the pages, etc. * * If @bytes is at least a full page then this just returns a page from * alloc_page(). * * If @bytes is a partial page then this stores the unused region of the * page in a per-cpu structure. Future partial-page allocations may be * satisfied from that cached region. This lets us waste less memory on * small allocations with minimal complexity. It works because the transmit * path passes read-only page regions down to devices. They hold a page * reference until they are done with the region. */ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, gfp_t gfp) { struct rds_page_remainder *rem; unsigned long flags; struct page *page; int ret; gfp |= __GFP_HIGHMEM; /* jump straight to allocation if we're trying for a huge page */ if (bytes >= PAGE_SIZE) { page = alloc_page(gfp); if (!page) { ret = -ENOMEM; } else { sg_set_page(scat, page, PAGE_SIZE, 0); ret = 0; } goto out; } rem = &per_cpu(rds_page_remainders, get_cpu()); local_irq_save(flags); while (1) { /* avoid a tiny region getting stuck by tossing it */ if (rem->r_page && bytes > (PAGE_SIZE - rem->r_offset)) { rds_stats_inc(s_page_remainder_miss); __free_page(rem->r_page); rem->r_page = NULL; } /* hand out a fragment from the cached page */ if (rem->r_page && bytes <= (PAGE_SIZE - rem->r_offset)) { sg_set_page(scat, rem->r_page, bytes, rem->r_offset); get_page(sg_page(scat)); if (rem->r_offset != 0) rds_stats_inc(s_page_remainder_hit); rem->r_offset += ALIGN(bytes, 8); if (rem->r_offset >= PAGE_SIZE) { __free_page(rem->r_page); rem->r_page = NULL; } ret = 0; break; } /* alloc if there is nothing for us to use */ local_irq_restore(flags); put_cpu(); page = alloc_page(gfp); rem = &per_cpu(rds_page_remainders, get_cpu()); local_irq_save(flags); if (!page) { ret = -ENOMEM; break; } /* did someone race to fill the remainder before us? */ if (rem->r_page) { __free_page(page); continue; } /* otherwise install our page and loop around to alloc */ rem->r_page = page; rem->r_offset = 0; } local_irq_restore(flags); put_cpu(); out: rdsdebug("bytes %lu ret %d %p %u %u\n", bytes, ret, ret ? NULL : sg_page(scat), ret ? 0 : scat->offset, ret ? 0 : scat->length); return ret; } EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); void rds_page_exit(void) { unsigned int cpu; for_each_possible_cpu(cpu) { struct rds_page_remainder *rem; rem = &per_cpu(rds_page_remainders, cpu); rdsdebug("cpu %u\n", cpu); if (rem->r_page) __free_page(rem->r_page); rem->r_page = NULL; } }
90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 /* netfs cookie management * * Copyright (C) 2004-2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * See Documentation/filesystems/caching/netfs-api.txt for more information on * the netfs API. */ #define FSCACHE_DEBUG_LEVEL COOKIE #include <linux/module.h> #include <linux/slab.h> #include "internal.h" struct kmem_cache *fscache_cookie_jar; static atomic_t fscache_object_debug_id = ATOMIC_INIT(0); static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie); static int fscache_alloc_object(struct fscache_cache *cache, struct fscache_cookie *cookie); static int fscache_attach_object(struct fscache_cookie *cookie, struct fscache_object *object); /* * initialise an cookie jar slab element prior to any use */ void fscache_cookie_init_once(void *_cookie) { struct fscache_cookie *cookie = _cookie; memset(cookie, 0, sizeof(*cookie)); spin_lock_init(&cookie->lock); spin_lock_init(&cookie->stores_lock); INIT_HLIST_HEAD(&cookie->backing_objects); } /* * request a cookie to represent an object (index, datafile, xattr, etc) * - parent specifies the parent object * - the top level index cookie for each netfs is stored in the fscache_netfs * struct upon registration * - def points to the definition * - the netfs_data will be passed to the functions pointed to in *def * - all attached caches will be searched to see if they contain this object * - index objects aren't stored on disk until there's a dependent file that * needs storing * - other objects are stored in a selected cache immediately, and all the * indices forming the path to it are instantiated if necessary * - we never let on to the netfs about errors * - we may set a negative cookie pointer, but that's okay */ struct fscache_cookie *__fscache_acquire_cookie( struct fscache_cookie *parent, const struct fscache_cookie_def *def, void *netfs_data, bool enable) { struct fscache_cookie *cookie; BUG_ON(!def); _enter("{%s},{%s},%p,%u", parent ? (char *) parent->def->name : "<no-parent>", def->name, netfs_data, enable); fscache_stat(&fscache_n_acquires); /* if there's no parent cookie, then we don't create one here either */ if (!parent) { fscache_stat(&fscache_n_acquires_null); _leave(" [no parent]"); return NULL; } /* validate the definition */ BUG_ON(!def->get_key); BUG_ON(!def->name[0]); BUG_ON(def->type == FSCACHE_COOKIE_TYPE_INDEX && parent->def->type != FSCACHE_COOKIE_TYPE_INDEX); /* allocate and initialise a cookie */ cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL); if (!cookie) { fscache_stat(&fscache_n_acquires_oom); _leave(" [ENOMEM]"); return NULL; } atomic_set(&cookie->usage, 1); atomic_set(&cookie->n_children, 0); /* We keep the active count elevated until relinquishment to prevent an * attempt to wake up every time the object operations queue quiesces. */ atomic_set(&cookie->n_active, 1); atomic_inc(&parent->usage); atomic_inc(&parent->n_children); cookie->def = def; cookie->parent = parent; cookie->netfs_data = netfs_data; cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET); /* radix tree insertion won't use the preallocation pool unless it's * told it may not wait */ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); switch (cookie->def->type) { case FSCACHE_COOKIE_TYPE_INDEX: fscache_stat(&fscache_n_cookie_index); break; case FSCACHE_COOKIE_TYPE_DATAFILE: fscache_stat(&fscache_n_cookie_data); break; default: fscache_stat(&fscache_n_cookie_special); break; } if (enable) { /* if the object is an index then we need do nothing more here * - we create indices on disk when we need them as an index * may exist in multiple caches */ if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { if (fscache_acquire_non_index_cookie(cookie) == 0) { set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } else { atomic_dec(&parent->n_children); __fscache_cookie_put(cookie); fscache_stat(&fscache_n_acquires_nobufs); _leave(" = NULL"); return NULL; } } else { set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } } fscache_stat(&fscache_n_acquires_ok); _leave(" = %p", cookie); return cookie; } EXPORT_SYMBOL(__fscache_acquire_cookie); /* * Enable a cookie to permit it to accept new operations. */ void __fscache_enable_cookie(struct fscache_cookie *cookie, bool (*can_enable)(void *data), void *data) { _enter("%p", cookie); wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, TASK_UNINTERRUPTIBLE); if (test_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock; if (can_enable && !can_enable(data)) { /* The netfs decided it didn't want to enable after all */ } else if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) { /* Wait for outstanding disablement to complete */ __fscache_wait_on_invalidate(cookie); if (fscache_acquire_non_index_cookie(cookie) == 0) set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } else { set_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags); } out_unlock: clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); } EXPORT_SYMBOL(__fscache_enable_cookie); /* * acquire a non-index cookie * - this must make sure the index chain is instantiated and instantiate the * object representation too */ static int fscache_acquire_non_index_cookie(struct fscache_cookie *cookie) { struct fscache_object *object; struct fscache_cache *cache; uint64_t i_size; int ret; _enter(""); set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); /* now we need to see whether the backing objects for this cookie yet * exist, if not there'll be nothing to search */ down_read(&fscache_addremove_sem); if (list_empty(&fscache_cache_list)) { up_read(&fscache_addremove_sem); _leave(" = 0 [no caches]"); return 0; } /* select a cache in which to store the object */ cache = fscache_select_cache_for_object(cookie->parent); if (!cache) { up_read(&fscache_addremove_sem); fscache_stat(&fscache_n_acquires_no_cache); _leave(" = -ENOMEDIUM [no cache]"); return -ENOMEDIUM; } _debug("cache %s", cache->tag->name); set_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags); /* ask the cache to allocate objects for this cookie and its parent * chain */ ret = fscache_alloc_object(cache, cookie); if (ret < 0) { up_read(&fscache_addremove_sem); _leave(" = %d", ret); return ret; } /* pass on how big the object we're caching is supposed to be */ cookie->def->get_attr(cookie->netfs_data, &i_size); spin_lock(&cookie->lock); if (hlist_empty(&cookie->backing_objects)) { spin_unlock(&cookie->lock); goto unavailable; } object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); fscache_set_store_limit(object, i_size); /* initiate the process of looking up all the objects in the chain * (done by fscache_initialise_object()) */ fscache_raise_event(object, FSCACHE_OBJECT_EV_NEW_CHILD); spin_unlock(&cookie->lock); /* we may be required to wait for lookup to complete at this point */ if (!fscache_defer_lookup) { _debug("non-deferred lookup %p", &cookie->flags); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_LOOKING_UP, TASK_UNINTERRUPTIBLE); _debug("complete"); if (test_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags)) goto unavailable; } up_read(&fscache_addremove_sem); _leave(" = 0 [deferred]"); return 0; unavailable: up_read(&fscache_addremove_sem); _leave(" = -ENOBUFS"); return -ENOBUFS; } /* * recursively allocate cache object records for a cookie/cache combination * - caller must be holding the addremove sem */ static int fscache_alloc_object(struct fscache_cache *cache, struct fscache_cookie *cookie) { struct fscache_object *object; int ret; _enter("%p,%p{%s}", cache, cookie, cookie->def->name); spin_lock(&cookie->lock); hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { if (object->cache == cache) goto object_already_extant; } spin_unlock(&cookie->lock); /* ask the cache to allocate an object (we may end up with duplicate * objects at this stage, but we sort that out later) */ fscache_stat(&fscache_n_cop_alloc_object); object = cache->ops->alloc_object(cache, cookie); fscache_stat_d(&fscache_n_cop_alloc_object); if (IS_ERR(object)) { fscache_stat(&fscache_n_object_no_alloc); ret = PTR_ERR(object); goto error; } fscache_stat(&fscache_n_object_alloc); object->debug_id = atomic_inc_return(&fscache_object_debug_id); _debug("ALLOC OBJ%x: %s {%lx}", object->debug_id, cookie->def->name, object->events); ret = fscache_alloc_object(cache, cookie->parent); if (ret < 0) goto error_put; /* only attach if we managed to allocate all we needed, otherwise * discard the object we just allocated and instead use the one * attached to the cookie */ if (fscache_attach_object(cookie, object) < 0) { fscache_stat(&fscache_n_cop_put_object); cache->ops->put_object(object); fscache_stat_d(&fscache_n_cop_put_object); } _leave(" = 0"); return 0; object_already_extant: ret = -ENOBUFS; if (fscache_object_is_dying(object) || fscache_cache_is_broken(object)) { spin_unlock(&cookie->lock); goto error; } spin_unlock(&cookie->lock); _leave(" = 0 [found]"); return 0; error_put: fscache_stat(&fscache_n_cop_put_object); cache->ops->put_object(object); fscache_stat_d(&fscache_n_cop_put_object); error: _leave(" = %d", ret); return ret; } /* * attach a cache object to a cookie */ static int fscache_attach_object(struct fscache_cookie *cookie, struct fscache_object *object) { struct fscache_object *p; struct fscache_cache *cache = object->cache; int ret; _enter("{%s},{OBJ%x}", cookie->def->name, object->debug_id); spin_lock(&cookie->lock); /* there may be multiple initial creations of this object, but we only * want one */ ret = -EEXIST; hlist_for_each_entry(p, &cookie->backing_objects, cookie_link) { if (p->cache == object->cache) { if (fscache_object_is_dying(p)) ret = -ENOBUFS; goto cant_attach_object; } } /* pin the parent object */ spin_lock_nested(&cookie->parent->lock, 1); hlist_for_each_entry(p, &cookie->parent->backing_objects, cookie_link) { if (p->cache == object->cache) { if (fscache_object_is_dying(p)) { ret = -ENOBUFS; spin_unlock(&cookie->parent->lock); goto cant_attach_object; } object->parent = p; spin_lock(&p->lock); p->n_children++; spin_unlock(&p->lock); break; } } spin_unlock(&cookie->parent->lock); /* attach to the cache's object list */ if (list_empty(&object->cache_link)) { spin_lock(&cache->object_list_lock); list_add(&object->cache_link, &cache->object_list); spin_unlock(&cache->object_list_lock); } /* attach to the cookie */ object->cookie = cookie; atomic_inc(&cookie->usage); hlist_add_head(&object->cookie_link, &cookie->backing_objects); fscache_objlist_add(object); ret = 0; cant_attach_object: spin_unlock(&cookie->lock); _leave(" = %d", ret); return ret; } /* * Invalidate an object. Callable with spinlocks held. */ void __fscache_invalidate(struct fscache_cookie *cookie) { struct fscache_object *object; _enter("{%s}", cookie->def->name); fscache_stat(&fscache_n_invalidates); /* Only permit invalidation of data files. Invalidating an index will * require the caller to release all its attachments to the tree rooted * there, and if it's doing that, it may as well just retire the * cookie. */ ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); /* We will be updating the cookie too. */ BUG_ON(!cookie->def->get_aux); /* If there's an object, we tell the object state machine to handle the * invalidation on our behalf, otherwise there's nothing to do. */ if (!hlist_empty(&cookie->backing_objects)) { spin_lock(&cookie->lock); if (fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects) && !test_and_set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) { object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); if (fscache_object_is_live(object)) fscache_raise_event( object, FSCACHE_OBJECT_EV_INVALIDATE); } spin_unlock(&cookie->lock); } _leave(""); } EXPORT_SYMBOL(__fscache_invalidate); /* * Wait for object invalidation to complete. */ void __fscache_wait_on_invalidate(struct fscache_cookie *cookie) { _enter("%p", cookie); wait_on_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING, TASK_UNINTERRUPTIBLE); _leave(""); } EXPORT_SYMBOL(__fscache_wait_on_invalidate); /* * update the index entries backing a cookie */ void __fscache_update_cookie(struct fscache_cookie *cookie) { struct fscache_object *object; fscache_stat(&fscache_n_updates); if (!cookie) { fscache_stat(&fscache_n_updates_null); _leave(" [no cookie]"); return; } _enter("{%s}", cookie->def->name); BUG_ON(!cookie->def->get_aux); spin_lock(&cookie->lock); if (fscache_cookie_enabled(cookie)) { /* update the index entry on disk in each cache backing this * cookie. */ hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { fscache_raise_event(object, FSCACHE_OBJECT_EV_UPDATE); } } spin_unlock(&cookie->lock); _leave(""); } EXPORT_SYMBOL(__fscache_update_cookie); /* * Disable a cookie to stop it from accepting new requests from the netfs. */ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) { struct fscache_object *object; bool awaken = false; _enter("%p,%u", cookie, invalidate); ASSERTCMP(atomic_read(&cookie->n_active), >, 0); if (atomic_read(&cookie->n_children) != 0) { pr_err("Cookie '%s' still has children\n", cookie->def->name); BUG(); } wait_on_bit_lock(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK, TASK_UNINTERRUPTIBLE); if (!test_and_clear_bit(FSCACHE_COOKIE_ENABLED, &cookie->flags)) goto out_unlock_enable; /* If the cookie is being invalidated, wait for that to complete first * so that we can reuse the flag. */ __fscache_wait_on_invalidate(cookie); /* Dispose of the backing objects */ set_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags); spin_lock(&cookie->lock); if (!hlist_empty(&cookie->backing_objects)) { hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { if (invalidate) set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); } } else { if (test_and_clear_bit(FSCACHE_COOKIE_INVALIDATING, &cookie->flags)) awaken = true; } spin_unlock(&cookie->lock); if (awaken) wake_up_bit(&cookie->flags, FSCACHE_COOKIE_INVALIDATING); /* Wait for cessation of activity requiring access to the netfs (when * n_active reaches 0). This makes sure outstanding reads and writes * have completed. */ if (!atomic_dec_and_test(&cookie->n_active)) wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, TASK_UNINTERRUPTIBLE); /* Make sure any pending writes are cancelled. */ if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) fscache_invalidate_writes(cookie); /* Reset the cookie state if it wasn't relinquished */ if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { atomic_inc(&cookie->n_active); set_bit(FSCACHE_COOKIE_NO_DATA_YET, &cookie->flags); } out_unlock_enable: clear_bit_unlock(FSCACHE_COOKIE_ENABLEMENT_LOCK, &cookie->flags); wake_up_bit(&cookie->flags, FSCACHE_COOKIE_ENABLEMENT_LOCK); _leave(""); } EXPORT_SYMBOL(__fscache_disable_cookie); /* * release a cookie back to the cache * - the object will be marked as recyclable on disk if retire is true * - all dependents of this cookie must have already been unregistered * (indices/files/pages) */ void __fscache_relinquish_cookie(struct fscache_cookie *cookie, bool retire) { fscache_stat(&fscache_n_relinquishes); if (retire) fscache_stat(&fscache_n_relinquishes_retire); if (!cookie) { fscache_stat(&fscache_n_relinquishes_null); _leave(" [no cookie]"); return; } _enter("%p{%s,%p,%d},%d", cookie, cookie->def->name, cookie->netfs_data, atomic_read(&cookie->n_active), retire); /* No further netfs-accessing operations on this cookie permitted */ set_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags); __fscache_disable_cookie(cookie, retire); /* Clear pointers back to the netfs */ cookie->netfs_data = NULL; cookie->def = NULL; BUG_ON(cookie->stores.rnode); if (cookie->parent) { ASSERTCMP(atomic_read(&cookie->parent->usage), >, 0); ASSERTCMP(atomic_read(&cookie->parent->n_children), >, 0); atomic_dec(&cookie->parent->n_children); } /* Dispose of the netfs's link to the cookie */ ASSERTCMP(atomic_read(&cookie->usage), >, 0); fscache_cookie_put(cookie); _leave(""); } EXPORT_SYMBOL(__fscache_relinquish_cookie); /* * destroy a cookie */ void __fscache_cookie_put(struct fscache_cookie *cookie) { struct fscache_cookie *parent; _enter("%p", cookie); for (;;) { _debug("FREE COOKIE %p", cookie); parent = cookie->parent; BUG_ON(!hlist_empty(&cookie->backing_objects)); kmem_cache_free(fscache_cookie_jar, cookie); if (!parent) break; cookie = parent; BUG_ON(atomic_read(&cookie->usage) <= 0); if (!atomic_dec_and_test(&cookie->usage)) break; } _leave(""); } /* * check the consistency between the netfs inode and the backing cache * * NOTE: it only serves no-index type */ int __fscache_check_consistency(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; bool wake_cookie = false; int ret; _enter("%p,", cookie); ASSERTCMP(cookie->def->type, ==, FSCACHE_COOKIE_TYPE_DATAFILE); if (fscache_wait_for_deferred_lookup(cookie) < 0) return -ERESTARTSYS; if (hlist_empty(&cookie->backing_objects)) return 0; op = kzalloc(sizeof(*op), GFP_NOIO | __GFP_NOMEMALLOC | __GFP_NORETRY); if (!op) return -ENOMEM; fscache_operation_init(op, NULL, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); spin_lock(&cookie->lock); if (!fscache_cookie_enabled(cookie) || hlist_empty(&cookie->backing_objects)) goto inconsistent; object = hlist_entry(cookie->backing_objects.first, struct fscache_object, cookie_link); if (test_bit(FSCACHE_IOERROR, &object->cache->flags)) goto inconsistent; op->debug_id = atomic_inc_return(&fscache_op_debug_id); __fscache_use_cookie(cookie); if (fscache_submit_op(object, op) < 0) goto submit_failed; /* the work queue now carries its own ref on the object */ spin_unlock(&cookie->lock); ret = fscache_wait_for_operation_activation(object, op, NULL, NULL); if (ret == 0) { /* ask the cache to honour the operation */ ret = object->cache->ops->check_consistency(op); fscache_op_complete(op, false); } else if (ret == -ENOBUFS) { ret = 0; } fscache_put_operation(op); _leave(" = %d", ret); return ret; submit_failed: wake_cookie = __fscache_unuse_cookie(cookie); inconsistent: spin_unlock(&cookie->lock); if (wake_cookie) __fscache_wake_unused_cookie(cookie); kfree(op); _leave(" = -ESTALE"); return -ESTALE; } EXPORT_SYMBOL(__fscache_check_consistency);
3 216 216 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 /* * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. * All Rights Reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it would be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef __XFS_LOG_PRIV_H__ #define __XFS_LOG_PRIV_H__ struct xfs_buf; struct xlog; struct xlog_ticket; struct xfs_mount; struct xfs_log_callback; /* * Flags for log structure */ #define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being shutdown */ #define XLOG_TAIL_WARN 0x10 /* log tail verify warning issued */ /* * get client id from packed copy. * * this hack is here because the xlog_pack code copies four bytes * of xlog_op_header containing the fields oh_clientid, oh_flags * and oh_res2 into the packed copy. * * later on this four byte chunk is treated as an int and the * client id is pulled out. * * this has endian issues, of course. */ static inline uint xlog_get_client_id(__be32 i) { return be32_to_cpu(i) >> 24; } /* * In core log state */ #define XLOG_STATE_ACTIVE 0x0001 /* Current IC log being written to */ #define XLOG_STATE_WANT_SYNC 0x0002 /* Want to sync this iclog; no more writes */ #define XLOG_STATE_SYNCING 0x0004 /* This IC log is syncing */ #define XLOG_STATE_DONE_SYNC 0x0008 /* Done syncing to disk */ #define XLOG_STATE_DO_CALLBACK \ 0x0010 /* Process callback functions */ #define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ #define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ #define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ #define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */ #define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ #define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ /* * Flags to log ticket */ #define XLOG_TIC_INITED 0x1 /* has been initialized */ #define XLOG_TIC_PERM_RESERV 0x2 /* permanent reservation */ #define XLOG_TIC_FLAGS \ { XLOG_TIC_INITED, "XLOG_TIC_INITED" }, \ { XLOG_TIC_PERM_RESERV, "XLOG_TIC_PERM_RESERV" } /* * Below are states for covering allocation transactions. * By covering, we mean changing the h_tail_lsn in the last on-disk * log write such that no allocation transactions will be re-done during * recovery after a system crash. Recovery starts at the last on-disk * log write. * * These states are used to insert dummy log entries to cover * space allocation transactions which can undo non-transactional changes * after a crash. Writes to a file with space * already allocated do not result in any transactions. Allocations * might include space beyond the EOF. So if we just push the EOF a * little, the last transaction for the file could contain the wrong * size. If there is no file system activity, after an allocation * transaction, and the system crashes, the allocation transaction * will get replayed and the file will be truncated. This could * be hours/days/... after the allocation occurred. * * The fix for this is to do two dummy transactions when the * system is idle. We need two dummy transaction because the h_tail_lsn * in the log record header needs to point beyond the last possible * non-dummy transaction. The first dummy changes the h_tail_lsn to * the first transaction before the dummy. The second dummy causes * h_tail_lsn to point to the first dummy. Recovery starts at h_tail_lsn. * * These dummy transactions get committed when everything * is idle (after there has been some activity). * * There are 5 states used to control this. * * IDLE -- no logging has been done on the file system or * we are done covering previous transactions. * NEED -- logging has occurred and we need a dummy transaction * when the log becomes idle. * DONE -- we were in the NEED state and have committed a dummy * transaction. * NEED2 -- we detected that a dummy transaction has gone to the * on disk log with no other transactions. * DONE2 -- we committed a dummy transaction when in the NEED2 state. * * There are two places where we switch states: * * 1.) In xfs_sync, when we detect an idle log and are in NEED or NEED2. * We commit the dummy transaction and switch to DONE or DONE2, * respectively. In all other states, we don't do anything. * * 2.) When we finish writing the on-disk log (xlog_state_clean_log). * * No matter what state we are in, if this isn't the dummy * transaction going out, the next state is NEED. * So, if we aren't in the DONE or DONE2 states, the next state * is NEED. We can't be finishing a write of the dummy record * unless it was committed and the state switched to DONE or DONE2. * * If we are in the DONE state and this was a write of the * dummy transaction, we move to NEED2. * * If we are in the DONE2 state and this was a write of the * dummy transaction, we move to IDLE. * * * Writing only one dummy transaction can get appended to * one file space allocation. When this happens, the log recovery * code replays the space allocation and a file could be truncated. * This is why we have the NEED2 and DONE2 states before going idle. */ #define XLOG_STATE_COVER_IDLE 0 #define XLOG_STATE_COVER_NEED 1 #define XLOG_STATE_COVER_DONE 2 #define XLOG_STATE_COVER_NEED2 3 #define XLOG_STATE_COVER_DONE2 4 #define XLOG_COVER_OPS 5 /* Ticket reservation region accounting */ #define XLOG_TIC_LEN_MAX 15 /* * Reservation region * As would be stored in xfs_log_iovec but without the i_addr which * we don't care about. */ typedef struct xlog_res { uint r_len; /* region length :4 */ uint r_type; /* region's transaction type :4 */ } xlog_res_t; typedef struct xlog_ticket { struct list_head t_queue; /* reserve/write queue */ struct task_struct *t_task; /* task that owns this ticket */ xlog_tid_t t_tid; /* transaction identifier : 4 */ atomic_t t_ref; /* ticket reference count : 4 */ int t_curr_res; /* current reservation in bytes : 4 */ int t_unit_res; /* unit reservation in bytes : 4 */ char t_ocnt; /* original count : 1 */ char t_cnt; /* current count : 1 */ char t_clientid; /* who does this belong to; : 1 */ char t_flags; /* properties of reservation : 1 */ /* reservation array fields */ uint t_res_num; /* num in array : 4 */ uint t_res_num_ophdrs; /* num op hdrs : 4 */ uint t_res_arr_sum; /* array sum : 4 */ uint t_res_o_flow; /* sum overflow : 4 */ xlog_res_t t_res_arr[XLOG_TIC_LEN_MAX]; /* array of res : 8 * 15 */ } xlog_ticket_t; /* * - A log record header is 512 bytes. There is plenty of room to grow the * xlog_rec_header_t into the reserved space. * - ic_data follows, so a write to disk can start at the beginning of * the iclog. * - ic_forcewait is used to implement synchronous forcing of the iclog to disk. * - ic_next is the pointer to the next iclog in the ring. * - ic_bp is a pointer to the buffer used to write this incore log to disk. * - ic_log is a pointer back to the global log structure. * - ic_callback is a linked list of callback function/argument pairs to be * called after an iclog finishes writing. * - ic_size is the full size of the header plus data. * - ic_offset is the current number of bytes written to in this iclog. * - ic_refcnt is bumped when someone is writing to the log. * - ic_state is the state of the iclog. * * Because of cacheline contention on large machines, we need to separate * various resources onto different cachelines. To start with, make the * structure cacheline aligned. The following fields can be contended on * by independent processes: * * - ic_callback_* * - ic_refcnt * - fields protected by the global l_icloglock * * so we need to ensure that these fields are located in separate cachelines. * We'll put all the read-only and l_icloglock fields in the first cacheline, * and move everything else out to subsequent cachelines. */ typedef struct xlog_in_core { wait_queue_head_t ic_force_wait; wait_queue_head_t ic_write_wait; struct xlog_in_core *ic_next; struct xlog_in_core *ic_prev; struct xfs_buf *ic_bp; struct xlog *ic_log; int ic_size; int ic_offset; int ic_bwritecnt; unsigned short ic_state; char *ic_datap; /* pointer to iclog data */ /* Callback structures need their own cacheline */ spinlock_t ic_callback_lock ____cacheline_aligned_in_smp; struct xfs_log_callback *ic_callback; struct xfs_log_callback **ic_callback_tail; /* reference counts need their own cacheline */ atomic_t ic_refcnt ____cacheline_aligned_in_smp; xlog_in_core_2_t *ic_data; #define ic_header ic_data->hic_header } xlog_in_core_t; /* * The CIL context is used to aggregate per-transaction details as well be * passed to the iclog for checkpoint post-commit processing. After being * passed to the iclog, another context needs to be allocated for tracking the * next set of transactions to be aggregated into a checkpoint. */ struct xfs_cil; struct xfs_cil_ctx { struct xfs_cil *cil; xfs_lsn_t sequence; /* chkpt sequence # */ xfs_lsn_t start_lsn; /* first LSN of chkpt commit */ xfs_lsn_t commit_lsn; /* chkpt commit record lsn */ struct xlog_ticket *ticket; /* chkpt ticket */ int nvecs; /* number of regions */ int space_used; /* aggregate size of regions */ struct list_head busy_extents; /* busy extents in chkpt */ struct xfs_log_vec *lv_chain; /* logvecs being pushed */ struct xfs_log_callback log_cb; /* completion callback hook. */ struct list_head committing; /* ctx committing list */ struct work_struct discard_endio_work; }; /* * Committed Item List structure * * This structure is used to track log items that have been committed but not * yet written into the log. It is used only when the delayed logging mount * option is enabled. * * This structure tracks the list of committing checkpoint contexts so * we can avoid the problem of having to hold out new transactions during a * flush until we have a the commit record LSN of the checkpoint. We can * traverse the list of committing contexts in xlog_cil_push_lsn() to find a * sequence match and extract the commit LSN directly from there. If the * checkpoint is still in the process of committing, we can block waiting for * the commit LSN to be determined as well. This should make synchronous * operations almost as efficient as the old logging methods. */ struct xfs_cil { struct xlog *xc_log; struct list_head xc_cil; spinlock_t xc_cil_lock; struct rw_semaphore xc_ctx_lock ____cacheline_aligned_in_smp; struct xfs_cil_ctx *xc_ctx; spinlock_t xc_push_lock ____cacheline_aligned_in_smp; xfs_lsn_t xc_push_seq; struct list_head xc_committing; wait_queue_head_t xc_commit_wait; xfs_lsn_t xc_current_sequence; struct work_struct xc_push_work; } ____cacheline_aligned_in_smp; /* * The amount of log space we allow the CIL to aggregate is difficult to size. * Whatever we choose, we have to make sure we can get a reservation for the * log space effectively, that it is large enough to capture sufficient * relogging to reduce log buffer IO significantly, but it is not too large for * the log or induces too much latency when writing out through the iclogs. We * track both space consumed and the number of vectors in the checkpoint * context, so we need to decide which to use for limiting. * * Every log buffer we write out during a push needs a header reserved, which * is at least one sector and more for v2 logs. Hence we need a reservation of * at least 512 bytes per 32k of log space just for the LR headers. That means * 16KB of reservation per megabyte of delayed logging space we will consume, * plus various headers. The number of headers will vary based on the num of * io vectors, so limiting on a specific number of vectors is going to result * in transactions of varying size. IOWs, it is more consistent to track and * limit space consumed in the log rather than by the number of objects being * logged in order to prevent checkpoint ticket overruns. * * Further, use of static reservations through the log grant mechanism is * problematic. It introduces a lot of complexity (e.g. reserve grant vs write * grant) and a significant deadlock potential because regranting write space * can block on log pushes. Hence if we have to regrant log space during a log * push, we can deadlock. * * However, we can avoid this by use of a dynamic "reservation stealing" * technique during transaction commit whereby unused reservation space in the * transaction ticket is transferred to the CIL ctx commit ticket to cover the * space needed by the checkpoint transaction. This means that we never need to * specifically reserve space for the CIL checkpoint transaction, nor do we * need to regrant space once the checkpoint completes. This also means the * checkpoint transaction ticket is specific to the checkpoint context, rather * than the CIL itself. * * With dynamic reservations, we can effectively make up arbitrary limits for * the checkpoint size so long as they don't violate any other size rules. * Recovery imposes a rule that no transaction exceed half the log, so we are * limited by that. Furthermore, the log transaction reservation subsystem * tries to keep 25% of the log free, so we need to keep below that limit or we * risk running out of free log space to start any new transactions. * * In order to keep background CIL push efficient, we will set a lower * threshold at which background pushing is attempted without blocking current * transaction commits. A separate, higher bound defines when CIL pushes are * enforced to ensure we stay within our maximum checkpoint size bounds. * threshold, yet give us plenty of space for aggregation on large logs. */ #define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3) /* * ticket grant locks, queues and accounting have their own cachlines * as these are quite hot and can be operated on concurrently. */ struct xlog_grant_head { spinlock_t lock ____cacheline_aligned_in_smp; struct list_head waiters; atomic64_t grant; }; /* * The reservation head lsn is not made up of a cycle number and block number. * Instead, it uses a cycle number and byte number. Logs don't expect to * overflow 31 bits worth of byte offset, so using a byte number will mean * that round off problems won't occur when releasing partial reservations. */ struct xlog { /* The following fields don't need locking */ struct xfs_mount *l_mp; /* mount point */ struct xfs_ail *l_ailp; /* AIL log is working with */ struct xfs_cil *l_cilp; /* CIL log is working with */ struct xfs_buf *l_xbuf; /* extra buffer for log * wrapping */ struct xfs_buftarg *l_targ; /* buftarg of log */ struct delayed_work l_work; /* background flush work */ uint l_flags; uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ struct list_head *l_buf_cancel_table; int l_iclog_hsize; /* size of iclog header */ int l_iclog_heads; /* # of iclog header sectors */ uint l_sectBBsize; /* sector size in BBs (2^n) */ int l_iclog_size; /* size of log in bytes */ int l_iclog_size_log; /* log power size of log */ int l_iclog_bufs; /* number of iclog buffers */ xfs_daddr_t l_logBBstart; /* start block of log */ int l_logsize; /* size of log in bytes */ int l_logBBsize; /* size of log in BB chunks */ /* The following block of fields are changed while holding icloglock */ wait_queue_head_t l_flush_wait ____cacheline_aligned_in_smp; /* waiting for iclog flush */ int l_covered_state;/* state of "covering disk * log entries" */ xlog_in_core_t *l_iclog; /* head log queue */ spinlock_t l_icloglock; /* grab to change iclog state */ int l_curr_cycle; /* Cycle number of log writes */ int l_prev_cycle; /* Cycle number before last * block increment */ int l_curr_block; /* current logical log block */ int l_prev_block; /* previous logical log block */ /* * l_last_sync_lsn and l_tail_lsn are atomics so they can be set and * read without needing to hold specific locks. To avoid operations * contending with other hot objects, place each of them on a separate * cacheline. */ /* lsn of last LR on disk */ atomic64_t l_last_sync_lsn ____cacheline_aligned_in_smp; /* lsn of 1st LR with unflushed * buffers */ atomic64_t l_tail_lsn ____cacheline_aligned_in_smp; struct xlog_grant_head l_reserve_head; struct xlog_grant_head l_write_head; struct xfs_kobj l_kobj; /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG void *l_iclog_bak[XLOG_MAX_ICLOGS]; /* log record crc error injection factor */ uint32_t l_badcrc_factor; #endif /* log recovery lsn tracking (for buffer submission */ xfs_lsn_t l_recovery_lsn; }; #define XLOG_BUF_CANCEL_BUCKET(log, blkno) \ ((log)->l_buf_cancel_table + ((uint64_t)blkno % XLOG_BC_TABLE_SIZE)) #define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) /* common routines */ extern int xlog_recover( struct xlog *log); extern int xlog_recover_finish( struct xlog *log); extern int xlog_recover_cancel(struct xlog *); extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); extern kmem_zone_t *xfs_log_ticket_zone; struct xlog_ticket * xlog_ticket_alloc( struct xlog *log, int unit_bytes, int count, char client, bool permanent, xfs_km_flags_t alloc_flags); static inline void xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes) { *ptr += bytes; *len -= bytes; *off += bytes; } void xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket); void xlog_print_trans(struct xfs_trans *); int xlog_write( struct xlog *log, struct xfs_log_vec *log_vector, struct xlog_ticket *tic, xfs_lsn_t *start_lsn, struct xlog_in_core **commit_iclog, uint flags); /* * When we crack an atomic LSN, we sample it first so that the value will not * change while we are cracking it into the component values. This means we * will always get consistent component values to work from. This should always * be used to sample and crack LSNs that are stored and updated in atomic * variables. */ static inline void xlog_crack_atomic_lsn(atomic64_t *lsn, uint *cycle, uint *block) { xfs_lsn_t val = atomic64_read(lsn); *cycle = CYCLE_LSN(val); *block = BLOCK_LSN(val); } /* * Calculate and assign a value to an atomic LSN variable from component pieces. */ static inline void xlog_assign_atomic_lsn(atomic64_t *lsn, uint cycle, uint block) { atomic64_set(lsn, xlog_assign_lsn(cycle, block)); } /* * When we crack the grant head, we sample it first so that the value will not * change while we are cracking it into the component values. This means we * will always get consistent component values to work from. */ static inline void xlog_crack_grant_head_val(int64_t val, int *cycle, int *space) { *cycle = val >> 32; *space = val & 0xffffffff; } static inline void xlog_crack_grant_head(atomic64_t *head, int *cycle, int *space) { xlog_crack_grant_head_val(atomic64_read(head), cycle, space); } static inline int64_t xlog_assign_grant_head_val(int cycle, int space) { return ((int64_t)cycle << 32) | space; } static inline void xlog_assign_grant_head(atomic64_t *head, int cycle, int space) { atomic64_set(head, xlog_assign_grant_head_val(cycle, space)); } /* * Committed Item List interfaces */ int xlog_cil_init(struct xlog *log); void xlog_cil_init_post_recovery(struct xlog *log); void xlog_cil_destroy(struct xlog *log); bool xlog_cil_empty(struct xlog *log); /* * CIL force routines */ xfs_lsn_t xlog_cil_force_lsn( struct xlog *log, xfs_lsn_t sequence); static inline void xlog_cil_force(struct xlog *log) { xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); } /* * Unmount record type is used as a pseudo transaction type for the ticket. * It's value must be outside the range of XFS_TRANS_* values. */ #define XLOG_UNMOUNT_REC_TYPE (-1U) /* * Wrapper function for waiting on a wait queue serialised against wakeups * by a spinlock. This matches the semantics of all the wait queues used in the * log code. */ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock) { DECLARE_WAITQUEUE(wait, current); add_wait_queue_exclusive(wq, &wait); __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(lock); schedule(); remove_wait_queue(wq, &wait); } /* * The LSN is valid so long as it is behind the current LSN. If it isn't, this * means that the next log record that includes this metadata could have a * smaller LSN. In turn, this means that the modification in the log would not * replay. */ static inline bool xlog_valid_lsn( struct xlog *log, xfs_lsn_t lsn) { int cur_cycle; int cur_block; bool valid = true; /* * First, sample the current lsn without locking to avoid added * contention from metadata I/O. The current cycle and block are updated * (in xlog_state_switch_iclogs()) and read here in a particular order * to avoid false negatives (e.g., thinking the metadata LSN is valid * when it is not). * * The current block is always rewound before the cycle is bumped in * xlog_state_switch_iclogs() to ensure the current LSN is never seen in * a transiently forward state. Instead, we can see the LSN in a * transiently behind state if we happen to race with a cycle wrap. */ cur_cycle = ACCESS_ONCE(log->l_curr_cycle); smp_rmb(); cur_block = ACCESS_ONCE(log->l_curr_block); if ((CYCLE_LSN(lsn) > cur_cycle) || (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) { /* * If the metadata LSN appears invalid, it's possible the check * above raced with a wrap to the next log cycle. Grab the lock * to check for sure. */ spin_lock(&log->l_icloglock); cur_cycle = log->l_curr_cycle; cur_block = log->l_curr_block; spin_unlock(&log->l_icloglock); if ((CYCLE_LSN(lsn) > cur_cycle) || (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) valid = false; } return valid; } #endif /* __XFS_LOG_PRIV_H__ */
83 131 44 80 166 116 116 33 134 131 44 166 82 204 204 204 32 32 116 116 116 116 116 116 33 1 73 1 15 5 5 5 5 6 7 7 1 1 2 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 /* * Dummy soundcard * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #include <linux/init.h> #include <linux/err.h> #include <linux/platform_device.h> #include <linux/jiffies.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/wait.h> #include <linux/hrtimer.h> #include <linux/math64.h> #include <linux/module.h> #include <sound/core.h> #include <sound/control.h> #include <sound/tlv.h> #include <sound/pcm.h> #include <sound/rawmidi.h> #include <sound/info.h> #include <sound/initval.h> MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("Dummy soundcard (/dev/null)"); MODULE_LICENSE("GPL"); MODULE_SUPPORTED_DEVICE("{{ALSA,Dummy soundcard}}"); #define MAX_PCM_DEVICES 4 #define MAX_PCM_SUBSTREAMS 128 #define MAX_MIDI_DEVICES 2 /* defaults */ #define MAX_BUFFER_SIZE (64*1024) #define MIN_PERIOD_SIZE 64 #define MAX_PERIOD_SIZE MAX_BUFFER_SIZE #define USE_FORMATS (SNDRV_PCM_FMTBIT_U8 | SNDRV_PCM_FMTBIT_S16_LE) #define USE_RATE SNDRV_PCM_RATE_CONTINUOUS | SNDRV_PCM_RATE_8000_48000 #define USE_RATE_MIN 5500 #define USE_RATE_MAX 48000 #define USE_CHANNELS_MIN 1 #define USE_CHANNELS_MAX 2 #define USE_PERIODS_MIN 1 #define USE_PERIODS_MAX 1024 static int index[SNDRV_CARDS] = SNDRV_DEFAULT_IDX; /* Index 0-MAX */ static char *id[SNDRV_CARDS] = SNDRV_DEFAULT_STR; /* ID for this card */ static bool enable[SNDRV_CARDS] = {1, [1 ... (SNDRV_CARDS - 1)] = 0}; static char *model[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = NULL}; static int pcm_devs[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = 1}; static int pcm_substreams[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = 8}; //static int midi_devs[SNDRV_CARDS] = {[0 ... (SNDRV_CARDS - 1)] = 2}; #ifdef CONFIG_HIGH_RES_TIMERS static bool hrtimer = 1; #endif static bool fake_buffer = 1; module_param_array(index, int, NULL, 0444); MODULE_PARM_DESC(index, "Index value for dummy soundcard."); module_param_array(id, charp, NULL, 0444); MODULE_PARM_DESC(id, "ID string for dummy soundcard."); module_param_array(enable, bool, NULL, 0444); MODULE_PARM_DESC(enable, "Enable this dummy soundcard."); module_param_array(model, charp, NULL, 0444); MODULE_PARM_DESC(model, "Soundcard model."); module_param_array(pcm_devs, int, NULL, 0444); MODULE_PARM_DESC(pcm_devs, "PCM devices # (0-4) for dummy driver."); module_param_array(pcm_substreams, int, NULL, 0444); MODULE_PARM_DESC(pcm_substreams, "PCM substreams # (1-128) for dummy driver."); //module_param_array(midi_devs, int, NULL, 0444); //MODULE_PARM_DESC(midi_devs, "MIDI devices # (0-2) for dummy driver."); module_param(fake_buffer, bool, 0444); MODULE_PARM_DESC(fake_buffer, "Fake buffer allocations."); #ifdef CONFIG_HIGH_RES_TIMERS module_param(hrtimer, bool, 0644); MODULE_PARM_DESC(hrtimer, "Use hrtimer as the timer source."); #endif static struct platform_device *devices[SNDRV_CARDS]; #define MIXER_ADDR_MASTER 0 #define MIXER_ADDR_LINE 1 #define MIXER_ADDR_MIC 2 #define MIXER_ADDR_SYNTH 3 #define MIXER_ADDR_CD 4 #define MIXER_ADDR_LAST 4 struct dummy_timer_ops { int (*create)(struct snd_pcm_substream *); void (*free)(struct snd_pcm_substream *); int (*prepare)(struct snd_pcm_substream *); int (*start)(struct snd_pcm_substream *); int (*stop)(struct snd_pcm_substream *); snd_pcm_uframes_t (*pointer)(struct snd_pcm_substream *); }; #define get_dummy_ops(substream) \ (*(const struct dummy_timer_ops **)(substream)->runtime->private_data) struct dummy_model { const char *name; int (*playback_constraints)(struct snd_pcm_runtime *runtime); int (*capture_constraints)(struct snd_pcm_runtime *runtime); u64 formats; size_t buffer_bytes_max; size_t period_bytes_min; size_t period_bytes_max; unsigned int periods_min; unsigned int periods_max; unsigned int rates; unsigned int rate_min; unsigned int rate_max; unsigned int channels_min; unsigned int channels_max; }; struct snd_dummy { struct snd_card *card; struct dummy_model *model; struct snd_pcm *pcm; struct snd_pcm_hardware pcm_hw; spinlock_t mixer_lock; int mixer_volume[MIXER_ADDR_LAST+1][2]; int capture_source[MIXER_ADDR_LAST+1][2]; int iobox; struct snd_kcontrol *cd_volume_ctl; struct snd_kcontrol *cd_switch_ctl; }; /* * card models */ static int emu10k1_playback_constraints(struct snd_pcm_runtime *runtime) { int err; err = snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIODS); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 256, UINT_MAX); if (err < 0) return err; return 0; } static struct dummy_model model_emu10k1 = { .name = "emu10k1", .playback_constraints = emu10k1_playback_constraints, .buffer_bytes_max = 128 * 1024, }; static struct dummy_model model_rme9652 = { .name = "rme9652", .buffer_bytes_max = 26 * 64 * 1024, .formats = SNDRV_PCM_FMTBIT_S32_LE, .channels_min = 26, .channels_max = 26, .periods_min = 2, .periods_max = 2, }; static struct dummy_model model_ice1712 = { .name = "ice1712", .buffer_bytes_max = 256 * 1024, .formats = SNDRV_PCM_FMTBIT_S32_LE, .channels_min = 10, .channels_max = 10, .periods_min = 1, .periods_max = 1024, }; static struct dummy_model model_uda1341 = { .name = "uda1341", .buffer_bytes_max = 16380, .formats = SNDRV_PCM_FMTBIT_S16_LE, .channels_min = 2, .channels_max = 2, .periods_min = 2, .periods_max = 255, }; static struct dummy_model model_ac97 = { .name = "ac97", .formats = SNDRV_PCM_FMTBIT_S16_LE, .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000, .rate_min = 48000, .rate_max = 48000, }; static struct dummy_model model_ca0106 = { .name = "ca0106", .formats = SNDRV_PCM_FMTBIT_S16_LE, .buffer_bytes_max = ((65536-64)*8), .period_bytes_max = (65536-64), .periods_min = 2, .periods_max = 8, .channels_min = 2, .channels_max = 2, .rates = SNDRV_PCM_RATE_48000|SNDRV_PCM_RATE_96000|SNDRV_PCM_RATE_192000, .rate_min = 48000, .rate_max = 192000, }; static struct dummy_model *dummy_models[] = { &model_emu10k1, &model_rme9652, &model_ice1712, &model_uda1341, &model_ac97, &model_ca0106, NULL }; /* * system timer interface */ struct dummy_systimer_pcm { /* ops must be the first item */ const struct dummy_timer_ops *timer_ops; spinlock_t lock; struct timer_list timer; unsigned long base_time; unsigned int frac_pos; /* fractional sample position (based HZ) */ unsigned int frac_period_rest; unsigned int frac_buffer_size; /* buffer_size * HZ */ unsigned int frac_period_size; /* period_size * HZ */ unsigned int rate; int elapsed; struct snd_pcm_substream *substream; }; static void dummy_systimer_rearm(struct dummy_systimer_pcm *dpcm) { mod_timer(&dpcm->timer, jiffies + (dpcm->frac_period_rest + dpcm->rate - 1) / dpcm->rate); } static void dummy_systimer_update(struct dummy_systimer_pcm *dpcm) { unsigned long delta; delta = jiffies - dpcm->base_time; if (!delta) return; dpcm->base_time += delta; delta *= dpcm->rate; dpcm->frac_pos += delta; while (dpcm->frac_pos >= dpcm->frac_buffer_size) dpcm->frac_pos -= dpcm->frac_buffer_size; while (dpcm->frac_period_rest <= delta) { dpcm->elapsed++; dpcm->frac_period_rest += dpcm->frac_period_size; } dpcm->frac_period_rest -= delta; } static int dummy_systimer_start(struct snd_pcm_substream *substream) { struct dummy_systimer_pcm *dpcm = substream->runtime->private_data; spin_lock(&dpcm->lock); dpcm->base_time = jiffies; dummy_systimer_rearm(dpcm); spin_unlock(&dpcm->lock); return 0; } static int dummy_systimer_stop(struct snd_pcm_substream *substream) { struct dummy_systimer_pcm *dpcm = substream->runtime->private_data; spin_lock(&dpcm->lock); del_timer(&dpcm->timer); spin_unlock(&dpcm->lock); return 0; } static int dummy_systimer_prepare(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct dummy_systimer_pcm *dpcm = runtime->private_data; dpcm->frac_pos = 0; dpcm->rate = runtime->rate; dpcm->frac_buffer_size = runtime->buffer_size * HZ; dpcm->frac_period_size = runtime->period_size * HZ; dpcm->frac_period_rest = dpcm->frac_period_size; dpcm->elapsed = 0; return 0; } static void dummy_systimer_callback(unsigned long data) { struct dummy_systimer_pcm *dpcm = (struct dummy_systimer_pcm *)data; unsigned long flags; int elapsed = 0; spin_lock_irqsave(&dpcm->lock, flags); dummy_systimer_update(dpcm); dummy_systimer_rearm(dpcm); elapsed = dpcm->elapsed; dpcm->elapsed = 0; spin_unlock_irqrestore(&dpcm->lock, flags); if (elapsed) snd_pcm_period_elapsed(dpcm->substream); } static snd_pcm_uframes_t dummy_systimer_pointer(struct snd_pcm_substream *substream) { struct dummy_systimer_pcm *dpcm = substream->runtime->private_data; snd_pcm_uframes_t pos; spin_lock(&dpcm->lock); dummy_systimer_update(dpcm); pos = dpcm->frac_pos / HZ; spin_unlock(&dpcm->lock); return pos; } static int dummy_systimer_create(struct snd_pcm_substream *substream) { struct dummy_systimer_pcm *dpcm; dpcm = kzalloc(sizeof(*dpcm), GFP_KERNEL); if (!dpcm) return -ENOMEM; substream->runtime->private_data = dpcm; setup_timer(&dpcm->timer, dummy_systimer_callback, (unsigned long) dpcm); spin_lock_init(&dpcm->lock); dpcm->substream = substream; return 0; } static void dummy_systimer_free(struct snd_pcm_substream *substream) { kfree(substream->runtime->private_data); } static const struct dummy_timer_ops dummy_systimer_ops = { .create = dummy_systimer_create, .free = dummy_systimer_free, .prepare = dummy_systimer_prepare, .start = dummy_systimer_start, .stop = dummy_systimer_stop, .pointer = dummy_systimer_pointer, }; #ifdef CONFIG_HIGH_RES_TIMERS /* * hrtimer interface */ struct dummy_hrtimer_pcm { /* ops must be the first item */ const struct dummy_timer_ops *timer_ops; ktime_t base_time; ktime_t period_time; atomic_t running; struct hrtimer timer; struct tasklet_struct tasklet; struct snd_pcm_substream *substream; }; static void dummy_hrtimer_pcm_elapsed(unsigned long priv) { struct dummy_hrtimer_pcm *dpcm = (struct dummy_hrtimer_pcm *)priv; if (atomic_read(&dpcm->running)) snd_pcm_period_elapsed(dpcm->substream); } static enum hrtimer_restart dummy_hrtimer_callback(struct hrtimer *timer) { struct dummy_hrtimer_pcm *dpcm; dpcm = container_of(timer, struct dummy_hrtimer_pcm, timer); if (!atomic_read(&dpcm->running)) return HRTIMER_NORESTART; tasklet_schedule(&dpcm->tasklet); hrtimer_forward_now(timer, dpcm->period_time); return HRTIMER_RESTART; } static int dummy_hrtimer_start(struct snd_pcm_substream *substream) { struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; dpcm->base_time = hrtimer_cb_get_time(&dpcm->timer); hrtimer_start(&dpcm->timer, dpcm->period_time, HRTIMER_MODE_REL); atomic_set(&dpcm->running, 1); return 0; } static int dummy_hrtimer_stop(struct snd_pcm_substream *substream) { struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; atomic_set(&dpcm->running, 0); hrtimer_cancel(&dpcm->timer); return 0; } static inline void dummy_hrtimer_sync(struct dummy_hrtimer_pcm *dpcm) { hrtimer_cancel(&dpcm->timer); tasklet_kill(&dpcm->tasklet); } static snd_pcm_uframes_t dummy_hrtimer_pointer(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct dummy_hrtimer_pcm *dpcm = runtime->private_data; u64 delta; u32 pos; delta = ktime_us_delta(hrtimer_cb_get_time(&dpcm->timer), dpcm->base_time); delta = div_u64(delta * runtime->rate + 999999, 1000000); div_u64_rem(delta, runtime->buffer_size, &pos); return pos; } static int dummy_hrtimer_prepare(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct dummy_hrtimer_pcm *dpcm = runtime->private_data; unsigned int period, rate; long sec; unsigned long nsecs; dummy_hrtimer_sync(dpcm); period = runtime->period_size; rate = runtime->rate; sec = period / rate; period %= rate; nsecs = div_u64((u64)period * 1000000000UL + rate - 1, rate); dpcm->period_time = ktime_set(sec, nsecs); return 0; } static int dummy_hrtimer_create(struct snd_pcm_substream *substream) { struct dummy_hrtimer_pcm *dpcm; dpcm = kzalloc(sizeof(*dpcm), GFP_KERNEL); if (!dpcm) return -ENOMEM; substream->runtime->private_data = dpcm; hrtimer_init(&dpcm->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); dpcm->timer.function = dummy_hrtimer_callback; dpcm->substream = substream; atomic_set(&dpcm->running, 0); tasklet_init(&dpcm->tasklet, dummy_hrtimer_pcm_elapsed, (unsigned long)dpcm); return 0; } static void dummy_hrtimer_free(struct snd_pcm_substream *substream) { struct dummy_hrtimer_pcm *dpcm = substream->runtime->private_data; dummy_hrtimer_sync(dpcm); kfree(dpcm); } static const struct dummy_timer_ops dummy_hrtimer_ops = { .create = dummy_hrtimer_create, .free = dummy_hrtimer_free, .prepare = dummy_hrtimer_prepare, .start = dummy_hrtimer_start, .stop = dummy_hrtimer_stop, .pointer = dummy_hrtimer_pointer, }; #endif /* CONFIG_HIGH_RES_TIMERS */ /* * PCM interface */ static int dummy_pcm_trigger(struct snd_pcm_substream *substream, int cmd) { switch (cmd) { case SNDRV_PCM_TRIGGER_START: case SNDRV_PCM_TRIGGER_RESUME: return get_dummy_ops(substream)->start(substream); case SNDRV_PCM_TRIGGER_STOP: case SNDRV_PCM_TRIGGER_SUSPEND: return get_dummy_ops(substream)->stop(substream); } return -EINVAL; } static int dummy_pcm_prepare(struct snd_pcm_substream *substream) { return get_dummy_ops(substream)->prepare(substream); } static snd_pcm_uframes_t dummy_pcm_pointer(struct snd_pcm_substream *substream) { return get_dummy_ops(substream)->pointer(substream); } static const struct snd_pcm_hardware dummy_pcm_hardware = { .info = (SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_INTERLEAVED | SNDRV_PCM_INFO_RESUME | SNDRV_PCM_INFO_MMAP_VALID), .formats = USE_FORMATS, .rates = USE_RATE, .rate_min = USE_RATE_MIN, .rate_max = USE_RATE_MAX, .channels_min = USE_CHANNELS_MIN, .channels_max = USE_CHANNELS_MAX, .buffer_bytes_max = MAX_BUFFER_SIZE, .period_bytes_min = MIN_PERIOD_SIZE, .period_bytes_max = MAX_PERIOD_SIZE, .periods_min = USE_PERIODS_MIN, .periods_max = USE_PERIODS_MAX, .fifo_size = 0, }; static int dummy_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *hw_params) { if (fake_buffer) { /* runtime->dma_bytes has to be set manually to allow mmap */ substream->runtime->dma_bytes = params_buffer_bytes(hw_params); return 0; } return snd_pcm_lib_malloc_pages(substream, params_buffer_bytes(hw_params)); } static int dummy_pcm_hw_free(struct snd_pcm_substream *substream) { if (fake_buffer) return 0; return snd_pcm_lib_free_pages(substream); } static int dummy_pcm_open(struct snd_pcm_substream *substream) { struct snd_dummy *dummy = snd_pcm_substream_chip(substream); struct dummy_model *model = dummy->model; struct snd_pcm_runtime *runtime = substream->runtime; const struct dummy_timer_ops *ops; int err; ops = &dummy_systimer_ops; #ifdef CONFIG_HIGH_RES_TIMERS if (hrtimer) ops = &dummy_hrtimer_ops; #endif err = ops->create(substream); if (err < 0) return err; get_dummy_ops(substream) = ops; runtime->hw = dummy->pcm_hw; if (substream->pcm->device & 1) { runtime->hw.info &= ~SNDRV_PCM_INFO_INTERLEAVED; runtime->hw.info |= SNDRV_PCM_INFO_NONINTERLEAVED; } if (substream->pcm->device & 2) runtime->hw.info &= ~(SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID); if (model == NULL) return 0; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { if (model->playback_constraints) err = model->playback_constraints(substream->runtime); } else { if (model->capture_constraints) err = model->capture_constraints(substream->runtime); } if (err < 0) { get_dummy_ops(substream)->free(substream); return err; } return 0; } static int dummy_pcm_close(struct snd_pcm_substream *substream) { get_dummy_ops(substream)->free(substream); return 0; } /* * dummy buffer handling */ static void *dummy_page[2]; static void free_fake_buffer(void) { if (fake_buffer) { int i; for (i = 0; i < 2; i++) if (dummy_page[i]) { free_page((unsigned long)dummy_page[i]); dummy_page[i] = NULL; } } } static int alloc_fake_buffer(void) { int i; if (!fake_buffer) return 0; for (i = 0; i < 2; i++) { dummy_page[i] = (void *)get_zeroed_page(GFP_KERNEL); if (!dummy_page[i]) { free_fake_buffer(); return -ENOMEM; } } return 0; } static int dummy_pcm_copy(struct snd_pcm_substream *substream, int channel, unsigned long pos, void __user *dst, unsigned long bytes) { return 0; /* do nothing */ } static int dummy_pcm_copy_kernel(struct snd_pcm_substream *substream, int channel, unsigned long pos, void *dst, unsigned long bytes) { return 0; /* do nothing */ } static int dummy_pcm_silence(struct snd_pcm_substream *substream, int channel, unsigned long pos, unsigned long bytes) { return 0; /* do nothing */ } static struct page *dummy_pcm_page(struct snd_pcm_substream *substream, unsigned long offset) { return virt_to_page(dummy_page[substream->stream]); /* the same page */ } static struct snd_pcm_ops dummy_pcm_ops = { .open = dummy_pcm_open, .close = dummy_pcm_close, .ioctl = snd_pcm_lib_ioctl, .hw_params = dummy_pcm_hw_params, .hw_free = dummy_pcm_hw_free, .prepare = dummy_pcm_prepare, .trigger = dummy_pcm_trigger, .pointer = dummy_pcm_pointer, }; static struct snd_pcm_ops dummy_pcm_ops_no_buf = { .open = dummy_pcm_open, .close = dummy_pcm_close, .ioctl = snd_pcm_lib_ioctl, .hw_params = dummy_pcm_hw_params, .hw_free = dummy_pcm_hw_free, .prepare = dummy_pcm_prepare, .trigger = dummy_pcm_trigger, .pointer = dummy_pcm_pointer, .copy_user = dummy_pcm_copy, .copy_kernel = dummy_pcm_copy_kernel, .fill_silence = dummy_pcm_silence, .page = dummy_pcm_page, }; static int snd_card_dummy_pcm(struct snd_dummy *dummy, int device, int substreams) { struct snd_pcm *pcm; struct snd_pcm_ops *ops; int err; err = snd_pcm_new(dummy->card, "Dummy PCM", device, substreams, substreams, &pcm); if (err < 0) return err; dummy->pcm = pcm; if (fake_buffer) ops = &dummy_pcm_ops_no_buf; else ops = &dummy_pcm_ops; snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, ops); snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_CAPTURE, ops); pcm->private_data = dummy; pcm->info_flags = 0; strcpy(pcm->name, "Dummy PCM"); if (!fake_buffer) { snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_CONTINUOUS, snd_dma_continuous_data(GFP_KERNEL), 0, 64*1024); } return 0; } /* * mixer interface */ #define DUMMY_VOLUME(xname, xindex, addr) \ { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, \ .access = SNDRV_CTL_ELEM_ACCESS_READWRITE | SNDRV_CTL_ELEM_ACCESS_TLV_READ, \ .name = xname, .index = xindex, \ .info = snd_dummy_volume_info, \ .get = snd_dummy_volume_get, .put = snd_dummy_volume_put, \ .private_value = addr, \ .tlv = { .p = db_scale_dummy } } static int snd_dummy_volume_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *uinfo) { uinfo->type = SNDRV_CTL_ELEM_TYPE_INTEGER; uinfo->count = 2; uinfo->value.integer.min = -50; uinfo->value.integer.max = 100; return 0; } static int snd_dummy_volume_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int addr = kcontrol->private_value; spin_lock_irq(&dummy->mixer_lock); ucontrol->value.integer.value[0] = dummy->mixer_volume[addr][0]; ucontrol->value.integer.value[1] = dummy->mixer_volume[addr][1]; spin_unlock_irq(&dummy->mixer_lock); return 0; } static int snd_dummy_volume_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int change, addr = kcontrol->private_value; int left, right; left = ucontrol->value.integer.value[0]; if (left < -50) left = -50; if (left > 100) left = 100; right = ucontrol->value.integer.value[1]; if (right < -50) right = -50; if (right > 100) right = 100; spin_lock_irq(&dummy->mixer_lock); change = dummy->mixer_volume[addr][0] != left || dummy->mixer_volume[addr][1] != right; dummy->mixer_volume[addr][0] = left; dummy->mixer_volume[addr][1] = right; spin_unlock_irq(&dummy->mixer_lock); return change; } static const DECLARE_TLV_DB_SCALE(db_scale_dummy, -4500, 30, 0); #define DUMMY_CAPSRC(xname, xindex, addr) \ { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = xname, .index = xindex, \ .info = snd_dummy_capsrc_info, \ .get = snd_dummy_capsrc_get, .put = snd_dummy_capsrc_put, \ .private_value = addr } #define snd_dummy_capsrc_info snd_ctl_boolean_stereo_info static int snd_dummy_capsrc_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int addr = kcontrol->private_value; spin_lock_irq(&dummy->mixer_lock); ucontrol->value.integer.value[0] = dummy->capture_source[addr][0]; ucontrol->value.integer.value[1] = dummy->capture_source[addr][1]; spin_unlock_irq(&dummy->mixer_lock); return 0; } static int snd_dummy_capsrc_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *ucontrol) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int change, addr = kcontrol->private_value; int left, right; left = ucontrol->value.integer.value[0] & 1; right = ucontrol->value.integer.value[1] & 1; spin_lock_irq(&dummy->mixer_lock); change = dummy->capture_source[addr][0] != left && dummy->capture_source[addr][1] != right; dummy->capture_source[addr][0] = left; dummy->capture_source[addr][1] = right; spin_unlock_irq(&dummy->mixer_lock); return change; } static int snd_dummy_iobox_info(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_info *info) { const char *const names[] = { "None", "CD Player" }; return snd_ctl_enum_info(info, 1, 2, names); } static int snd_dummy_iobox_get(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *value) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); value->value.enumerated.item[0] = dummy->iobox; return 0; } static int snd_dummy_iobox_put(struct snd_kcontrol *kcontrol, struct snd_ctl_elem_value *value) { struct snd_dummy *dummy = snd_kcontrol_chip(kcontrol); int changed; if (value->value.enumerated.item[0] > 1) return -EINVAL; changed = value->value.enumerated.item[0] != dummy->iobox; if (changed) { dummy->iobox = value->value.enumerated.item[0]; if (dummy->iobox) { dummy->cd_volume_ctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; dummy->cd_switch_ctl->vd[0].access &= ~SNDRV_CTL_ELEM_ACCESS_INACTIVE; } else { dummy->cd_volume_ctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_INACTIVE; dummy->cd_switch_ctl->vd[0].access |= SNDRV_CTL_ELEM_ACCESS_INACTIVE; } snd_ctl_notify(dummy->card, SNDRV_CTL_EVENT_MASK_INFO, &dummy->cd_volume_ctl->id); snd_ctl_notify(dummy->card, SNDRV_CTL_EVENT_MASK_INFO, &dummy->cd_switch_ctl->id); } return changed; } static struct snd_kcontrol_new snd_dummy_controls[] = { DUMMY_VOLUME("Master Volume", 0, MIXER_ADDR_MASTER), DUMMY_CAPSRC("Master Capture Switch", 0, MIXER_ADDR_MASTER), DUMMY_VOLUME("Synth Volume", 0, MIXER_ADDR_SYNTH), DUMMY_CAPSRC("Synth Capture Switch", 0, MIXER_ADDR_SYNTH), DUMMY_VOLUME("Line Volume", 0, MIXER_ADDR_LINE), DUMMY_CAPSRC("Line Capture Switch", 0, MIXER_ADDR_LINE), DUMMY_VOLUME("Mic Volume", 0, MIXER_ADDR_MIC), DUMMY_CAPSRC("Mic Capture Switch", 0, MIXER_ADDR_MIC), DUMMY_VOLUME("CD Volume", 0, MIXER_ADDR_CD), DUMMY_CAPSRC("CD Capture Switch", 0, MIXER_ADDR_CD), { .iface = SNDRV_CTL_ELEM_IFACE_MIXER, .name = "External I/O Box", .info = snd_dummy_iobox_info, .get = snd_dummy_iobox_get, .put = snd_dummy_iobox_put, }, }; static int snd_card_dummy_new_mixer(struct snd_dummy *dummy) { struct snd_card *card = dummy->card; struct snd_kcontrol *kcontrol; unsigned int idx; int err; spin_lock_init(&dummy->mixer_lock); strcpy(card->mixername, "Dummy Mixer"); dummy->iobox = 1; for (idx = 0; idx < ARRAY_SIZE(snd_dummy_controls); idx++) { kcontrol = snd_ctl_new1(&snd_dummy_controls[idx], dummy); err = snd_ctl_add(card, kcontrol); if (err < 0) return err; if (!strcmp(kcontrol->id.name, "CD Volume")) dummy->cd_volume_ctl = kcontrol; else if (!strcmp(kcontrol->id.name, "CD Capture Switch")) dummy->cd_switch_ctl = kcontrol; } return 0; } #if defined(CONFIG_SND_DEBUG) && defined(CONFIG_SND_PROC_FS) /* * proc interface */ static void print_formats(struct snd_dummy *dummy, struct snd_info_buffer *buffer) { int i; for (i = 0; i <= SNDRV_PCM_FORMAT_LAST; i++) { if (dummy->pcm_hw.formats & (1ULL << i)) snd_iprintf(buffer, " %s", snd_pcm_format_name(i)); } } static void print_rates(struct snd_dummy *dummy, struct snd_info_buffer *buffer) { static int rates[] = { 5512, 8000, 11025, 16000, 22050, 32000, 44100, 48000, 64000, 88200, 96000, 176400, 192000, }; int i; if (dummy->pcm_hw.rates & SNDRV_PCM_RATE_CONTINUOUS) snd_iprintf(buffer, " continuous"); if (dummy->pcm_hw.rates & SNDRV_PCM_RATE_KNOT) snd_iprintf(buffer, " knot"); for (i = 0; i < ARRAY_SIZE(rates); i++) if (dummy->pcm_hw.rates & (1 << i)) snd_iprintf(buffer, " %d", rates[i]); } #define get_dummy_int_ptr(dummy, ofs) \ (unsigned int *)((char *)&((dummy)->pcm_hw) + (ofs)) #define get_dummy_ll_ptr(dummy, ofs) \ (unsigned long long *)((char *)&((dummy)->pcm_hw) + (ofs)) struct dummy_hw_field { const char *name; const char *format; unsigned int offset; unsigned int size; }; #define FIELD_ENTRY(item, fmt) { \ .name = #item, \ .format = fmt, \ .offset = offsetof(struct snd_pcm_hardware, item), \ .size = sizeof(dummy_pcm_hardware.item) } static struct dummy_hw_field fields[] = { FIELD_ENTRY(formats, "%#llx"), FIELD_ENTRY(rates, "%#x"), FIELD_ENTRY(rate_min, "%d"), FIELD_ENTRY(rate_max, "%d"), FIELD_ENTRY(channels_min, "%d"), FIELD_ENTRY(channels_max, "%d"), FIELD_ENTRY(buffer_bytes_max, "%ld"), FIELD_ENTRY(period_bytes_min, "%ld"), FIELD_ENTRY(period_bytes_max, "%ld"), FIELD_ENTRY(periods_min, "%d"), FIELD_ENTRY(periods_max, "%d"), }; static void dummy_proc_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_dummy *dummy = entry->private_data; int i; for (i = 0; i < ARRAY_SIZE(fields); i++) { snd_iprintf(buffer, "%s ", fields[i].name); if (fields[i].size == sizeof(int)) snd_iprintf(buffer, fields[i].format, *get_dummy_int_ptr(dummy, fields[i].offset)); else snd_iprintf(buffer, fields[i].format, *get_dummy_ll_ptr(dummy, fields[i].offset)); if (!strcmp(fields[i].name, "formats")) print_formats(dummy, buffer); else if (!strcmp(fields[i].name, "rates")) print_rates(dummy, buffer); snd_iprintf(buffer, "\n"); } } static void dummy_proc_write(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { struct snd_dummy *dummy = entry->private_data; char line[64]; while (!snd_info_get_line(buffer, line, sizeof(line))) { char item[20]; const char *ptr; unsigned long long val; int i; ptr = snd_info_get_str(item, line, sizeof(item)); for (i = 0; i < ARRAY_SIZE(fields); i++) { if (!strcmp(item, fields[i].name)) break; } if (i >= ARRAY_SIZE(fields)) continue; snd_info_get_str(item, ptr, sizeof(item)); if (kstrtoull(item, 0, &val)) continue; if (fields[i].size == sizeof(int)) *get_dummy_int_ptr(dummy, fields[i].offset) = val; else *get_dummy_ll_ptr(dummy, fields[i].offset) = val; } } static void dummy_proc_init(struct snd_dummy *chip) { struct snd_info_entry *entry; if (!snd_card_proc_new(chip->card, "dummy_pcm", &entry)) { snd_info_set_text_ops(entry, chip, dummy_proc_read); entry->c.text.write = dummy_proc_write; entry->mode |= S_IWUSR; entry->private_data = chip; } } #else #define dummy_proc_init(x) #endif /* CONFIG_SND_DEBUG && CONFIG_SND_PROC_FS */ static int snd_dummy_probe(struct platform_device *devptr) { struct snd_card *card; struct snd_dummy *dummy; struct dummy_model *m = NULL, **mdl; int idx, err; int dev = devptr->id; err = snd_card_new(&devptr->dev, index[dev], id[dev], THIS_MODULE, sizeof(struct snd_dummy), &card); if (err < 0) return err; dummy = card->private_data; dummy->card = card; for (mdl = dummy_models; *mdl && model[dev]; mdl++) { if (strcmp(model[dev], (*mdl)->name) == 0) { printk(KERN_INFO "snd-dummy: Using model '%s' for card %i\n", (*mdl)->name, card->number); m = dummy->model = *mdl; break; } } for (idx = 0; idx < MAX_PCM_DEVICES && idx < pcm_devs[dev]; idx++) { if (pcm_substreams[dev] < 1) pcm_substreams[dev] = 1; if (pcm_substreams[dev] > MAX_PCM_SUBSTREAMS) pcm_substreams[dev] = MAX_PCM_SUBSTREAMS; err = snd_card_dummy_pcm(dummy, idx, pcm_substreams[dev]); if (err < 0) goto __nodev; } dummy->pcm_hw = dummy_pcm_hardware; if (m) { if (m->formats) dummy->pcm_hw.formats = m->formats; if (m->buffer_bytes_max) dummy->pcm_hw.buffer_bytes_max = m->buffer_bytes_max; if (m->period_bytes_min) dummy->pcm_hw.period_bytes_min = m->period_bytes_min; if (m->period_bytes_max) dummy->pcm_hw.period_bytes_max = m->period_bytes_max; if (m->periods_min) dummy->pcm_hw.periods_min = m->periods_min; if (m->periods_max) dummy->pcm_hw.periods_max = m->periods_max; if (m->rates) dummy->pcm_hw.rates = m->rates; if (m->rate_min) dummy->pcm_hw.rate_min = m->rate_min; if (m->rate_max) dummy->pcm_hw.rate_max = m->rate_max; if (m->channels_min) dummy->pcm_hw.channels_min = m->channels_min; if (m->channels_max) dummy->pcm_hw.channels_max = m->channels_max; } err = snd_card_dummy_new_mixer(dummy); if (err < 0) goto __nodev; strcpy(card->driver, "Dummy"); strcpy(card->shortname, "Dummy"); sprintf(card->longname, "Dummy %i", dev + 1); dummy_proc_init(dummy); err = snd_card_register(card); if (err == 0) { platform_set_drvdata(devptr, card); return 0; } __nodev: snd_card_free(card); return err; } static int snd_dummy_remove(struct platform_device *devptr) { snd_card_free(platform_get_drvdata(devptr)); return 0; } #ifdef CONFIG_PM_SLEEP static int snd_dummy_suspend(struct device *pdev) { struct snd_card *card = dev_get_drvdata(pdev); struct snd_dummy *dummy = card->private_data; snd_power_change_state(card, SNDRV_CTL_POWER_D3hot); snd_pcm_suspend_all(dummy->pcm); return 0; } static int snd_dummy_resume(struct device *pdev) { struct snd_card *card = dev_get_drvdata(pdev); snd_power_change_state(card, SNDRV_CTL_POWER_D0); return 0; } static SIMPLE_DEV_PM_OPS(snd_dummy_pm, snd_dummy_suspend, snd_dummy_resume); #define SND_DUMMY_PM_OPS &snd_dummy_pm #else #define SND_DUMMY_PM_OPS NULL #endif #define SND_DUMMY_DRIVER "snd_dummy" static struct platform_driver snd_dummy_driver = { .probe = snd_dummy_probe, .remove = snd_dummy_remove, .driver = { .name = SND_DUMMY_DRIVER, .pm = SND_DUMMY_PM_OPS, }, }; static void snd_dummy_unregister_all(void) { int i; for (i = 0; i < ARRAY_SIZE(devices); ++i) platform_device_unregister(devices[i]); platform_driver_unregister(&snd_dummy_driver); free_fake_buffer(); } static int __init alsa_card_dummy_init(void) { int i, cards, err; err = platform_driver_register(&snd_dummy_driver); if (err < 0) return err; err = alloc_fake_buffer(); if (err < 0) { platform_driver_unregister(&snd_dummy_driver); return err; } cards = 0; for (i = 0; i < SNDRV_CARDS; i++) { struct platform_device *device; if (! enable[i]) continue; device = platform_device_register_simple(SND_DUMMY_DRIVER, i, NULL, 0); if (IS_ERR(device)) continue; if (!platform_get_drvdata(device)) { platform_device_unregister(device); continue; } devices[i] = device; cards++; } if (!cards) { #ifdef MODULE printk(KERN_ERR "Dummy soundcard not found or device busy\n"); #endif snd_dummy_unregister_all(); return -ENODEV; } return 0; } static void __exit alsa_card_dummy_exit(void) { snd_dummy_unregister_all(); } module_init(alsa_card_dummy_init) module_exit(alsa_card_dummy_exit)
2 2 14 8 8 16 13 13 13 3 1 1 1 16 14 14 14 2 14 14 14 14 19 18 1 17 16 16 16 14 2 22 19 3 4 3 28 2 2 2 1 28 14 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 /* RFCOMM implementation for Linux Bluetooth stack (BlueZ). Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com> Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* * RFCOMM TTY. */ #include <linux/module.h> #include <linux/tty.h> #include <linux/tty_driver.h> #include <linux/tty_flip.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/rfcomm.h> #define RFCOMM_TTY_MAGIC 0x6d02 /* magic number for rfcomm struct */ #define RFCOMM_TTY_PORTS RFCOMM_MAX_DEV /* whole lotta rfcomm devices */ #define RFCOMM_TTY_MAJOR 216 /* device node major id of the usb/bluetooth.c driver */ #define RFCOMM_TTY_MINOR 0 static DEFINE_MUTEX(rfcomm_ioctl_mutex); static struct tty_driver *rfcomm_tty_driver; struct rfcomm_dev { struct tty_port port; struct list_head list; char name[12]; int id; unsigned long flags; int err; unsigned long status; /* don't export to userspace */ bdaddr_t src; bdaddr_t dst; u8 channel; uint modem_status; struct rfcomm_dlc *dlc; struct device *tty_dev; atomic_t wmem_alloc; struct sk_buff_head pending; }; static LIST_HEAD(rfcomm_dev_list); static DEFINE_MUTEX(rfcomm_dev_lock); static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb); static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err); static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig); /* ---- Device functions ---- */ static void rfcomm_dev_destruct(struct tty_port *port) { struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port); struct rfcomm_dlc *dlc = dev->dlc; BT_DBG("dev %p dlc %p", dev, dlc); rfcomm_dlc_lock(dlc); /* Detach DLC if it's owned by this dev */ if (dlc->owner == dev) dlc->owner = NULL; rfcomm_dlc_unlock(dlc); rfcomm_dlc_put(dlc); if (dev->tty_dev) tty_unregister_device(rfcomm_tty_driver, dev->id); mutex_lock(&rfcomm_dev_lock); list_del(&dev->list); mutex_unlock(&rfcomm_dev_lock); kfree(dev); /* It's safe to call module_put() here because socket still holds reference to this module. */ module_put(THIS_MODULE); } /* device-specific initialization: open the dlc */ static int rfcomm_dev_activate(struct tty_port *port, struct tty_struct *tty) { struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port); int err; err = rfcomm_dlc_open(dev->dlc, &dev->src, &dev->dst, dev->channel); if (err) set_bit(TTY_IO_ERROR, &tty->flags); return err; } /* we block the open until the dlc->state becomes BT_CONNECTED */ static int rfcomm_dev_carrier_raised(struct tty_port *port) { struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port); return (dev->dlc->state == BT_CONNECTED); } /* device-specific cleanup: close the dlc */ static void rfcomm_dev_shutdown(struct tty_port *port) { struct rfcomm_dev *dev = container_of(port, struct rfcomm_dev, port); if (dev->tty_dev->parent) device_move(dev->tty_dev, NULL, DPM_ORDER_DEV_LAST); /* close the dlc */ rfcomm_dlc_close(dev->dlc, 0); } static const struct tty_port_operations rfcomm_port_ops = { .destruct = rfcomm_dev_destruct, .activate = rfcomm_dev_activate, .shutdown = rfcomm_dev_shutdown, .carrier_raised = rfcomm_dev_carrier_raised, }; static struct rfcomm_dev *__rfcomm_dev_lookup(int id) { struct rfcomm_dev *dev; list_for_each_entry(dev, &rfcomm_dev_list, list) if (dev->id == id) return dev; return NULL; } static struct rfcomm_dev *rfcomm_dev_get(int id) { struct rfcomm_dev *dev; mutex_lock(&rfcomm_dev_lock); dev = __rfcomm_dev_lookup(id); if (dev && !tty_port_get(&dev->port)) dev = NULL; mutex_unlock(&rfcomm_dev_lock); return dev; } static void rfcomm_reparent_device(struct rfcomm_dev *dev) { struct hci_dev *hdev; struct hci_conn *conn; hdev = hci_get_route(&dev->dst, &dev->src, BDADDR_BREDR); if (!hdev) return; /* The lookup results are unsafe to access without the * hci device lock (FIXME: why is this not documented?) */ hci_dev_lock(hdev); conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &dev->dst); /* Just because the acl link is in the hash table is no * guarantee the sysfs device has been added ... */ if (conn && device_is_registered(&conn->dev)) device_move(dev->tty_dev, &conn->dev, DPM_ORDER_DEV_AFTER_PARENT); hci_dev_unlock(hdev); hci_dev_put(hdev); } static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); return sprintf(buf, "%pMR\n", &dev->dst); } static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); return sprintf(buf, "%d\n", dev->channel); } static DEVICE_ATTR(address, S_IRUGO, show_address, NULL); static DEVICE_ATTR(channel, S_IRUGO, show_channel, NULL); static struct rfcomm_dev *__rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) { struct rfcomm_dev *dev, *entry; struct list_head *head = &rfcomm_dev_list; int err = 0; dev = kzalloc(sizeof(struct rfcomm_dev), GFP_KERNEL); if (!dev) return ERR_PTR(-ENOMEM); mutex_lock(&rfcomm_dev_lock); if (req->dev_id < 0) { dev->id = 0; list_for_each_entry(entry, &rfcomm_dev_list, list) { if (entry->id != dev->id) break; dev->id++; head = &entry->list; } } else { dev->id = req->dev_id; list_for_each_entry(entry, &rfcomm_dev_list, list) { if (entry->id == dev->id) { err = -EADDRINUSE; goto out; } if (entry->id > dev->id - 1) break; head = &entry->list; } } if ((dev->id < 0) || (dev->id > RFCOMM_MAX_DEV - 1)) { err = -ENFILE; goto out; } sprintf(dev->name, "rfcomm%d", dev->id); list_add(&dev->list, head); bacpy(&dev->src, &req->src); bacpy(&dev->dst, &req->dst); dev->channel = req->channel; dev->flags = req->flags & ((1 << RFCOMM_RELEASE_ONHUP) | (1 << RFCOMM_REUSE_DLC)); tty_port_init(&dev->port); dev->port.ops = &rfcomm_port_ops; skb_queue_head_init(&dev->pending); rfcomm_dlc_lock(dlc); if (req->flags & (1 << RFCOMM_REUSE_DLC)) { struct sock *sk = dlc->owner; struct sk_buff *skb; BUG_ON(!sk); rfcomm_dlc_throttle(dlc); while ((skb = skb_dequeue(&sk->sk_receive_queue))) { skb_orphan(skb); skb_queue_tail(&dev->pending, skb); atomic_sub(skb->len, &sk->sk_rmem_alloc); } } dlc->data_ready = rfcomm_dev_data_ready; dlc->state_change = rfcomm_dev_state_change; dlc->modem_status = rfcomm_dev_modem_status; dlc->owner = dev; dev->dlc = dlc; rfcomm_dev_modem_status(dlc, dlc->remote_v24_sig); rfcomm_dlc_unlock(dlc); /* It's safe to call __module_get() here because socket already holds reference to this module. */ __module_get(THIS_MODULE); mutex_unlock(&rfcomm_dev_lock); return dev; out: mutex_unlock(&rfcomm_dev_lock); kfree(dev); return ERR_PTR(err); } static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) { struct rfcomm_dev *dev; struct device *tty; BT_DBG("id %d channel %d", req->dev_id, req->channel); dev = __rfcomm_dev_add(req, dlc); if (IS_ERR(dev)) { rfcomm_dlc_put(dlc); return PTR_ERR(dev); } tty = tty_port_register_device(&dev->port, rfcomm_tty_driver, dev->id, NULL); if (IS_ERR(tty)) { tty_port_put(&dev->port); return PTR_ERR(tty); } dev->tty_dev = tty; rfcomm_reparent_device(dev); dev_set_drvdata(dev->tty_dev, dev); if (device_create_file(dev->tty_dev, &dev_attr_address) < 0) BT_ERR("Failed to create address attribute"); if (device_create_file(dev->tty_dev, &dev_attr_channel) < 0) BT_ERR("Failed to create channel attribute"); return dev->id; } /* ---- Send buffer ---- */ static inline unsigned int rfcomm_room(struct rfcomm_dev *dev) { struct rfcomm_dlc *dlc = dev->dlc; /* Limit the outstanding number of packets not yet sent to 40 */ int pending = 40 - atomic_read(&dev->wmem_alloc); return max(0, pending) * dlc->mtu; } static void rfcomm_wfree(struct sk_buff *skb) { struct rfcomm_dev *dev = (void *) skb->sk; atomic_dec(&dev->wmem_alloc); if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags)) tty_port_tty_wakeup(&dev->port); tty_port_put(&dev->port); } static void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *dev) { tty_port_get(&dev->port); atomic_inc(&dev->wmem_alloc); skb->sk = (void *) dev; skb->destructor = rfcomm_wfree; } static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, gfp_t priority) { struct sk_buff *skb = alloc_skb(size, priority); if (skb) rfcomm_set_owner_w(skb, dev); return skb; } /* ---- Device IOCTLs ---- */ #define NOCAP_FLAGS ((1 << RFCOMM_REUSE_DLC) | (1 << RFCOMM_RELEASE_ONHUP)) static int __rfcomm_create_dev(struct sock *sk, void __user *arg) { struct rfcomm_dev_req req; struct rfcomm_dlc *dlc; int id; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; BT_DBG("sk %p dev_id %d flags 0x%x", sk, req.dev_id, req.flags); if (req.flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) return -EPERM; if (req.flags & (1 << RFCOMM_REUSE_DLC)) { /* Socket must be connected */ if (sk->sk_state != BT_CONNECTED) return -EBADFD; dlc = rfcomm_pi(sk)->dlc; rfcomm_dlc_hold(dlc); } else { /* Validate the channel is unused */ dlc = rfcomm_dlc_exists(&req.src, &req.dst, req.channel); if (IS_ERR(dlc)) return PTR_ERR(dlc); if (dlc) return -EBUSY; dlc = rfcomm_dlc_alloc(GFP_KERNEL); if (!dlc) return -ENOMEM; } id = rfcomm_dev_add(&req, dlc); if (id < 0) return id; if (req.flags & (1 << RFCOMM_REUSE_DLC)) { /* DLC is now used by device. * Socket must be disconnected */ sk->sk_state = BT_CLOSED; } return id; } static int __rfcomm_release_dev(void __user *arg) { struct rfcomm_dev_req req; struct rfcomm_dev *dev; struct tty_struct *tty; if (copy_from_user(&req, arg, sizeof(req))) return -EFAULT; BT_DBG("dev_id %d flags 0x%x", req.dev_id, req.flags); dev = rfcomm_dev_get(req.dev_id); if (!dev) return -ENODEV; if (dev->flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) { tty_port_put(&dev->port); return -EPERM; } /* only release once */ if (test_and_set_bit(RFCOMM_DEV_RELEASED, &dev->status)) { tty_port_put(&dev->port); return -EALREADY; } if (req.flags & (1 << RFCOMM_HANGUP_NOW)) rfcomm_dlc_close(dev->dlc, 0); /* Shut down TTY synchronously before freeing rfcomm_dev */ tty = tty_port_tty_get(&dev->port); if (tty) { tty_vhangup(tty); tty_kref_put(tty); } if (!test_bit(RFCOMM_TTY_OWNED, &dev->status)) tty_port_put(&dev->port); tty_port_put(&dev->port); return 0; } static int rfcomm_create_dev(struct sock *sk, void __user *arg) { int ret; mutex_lock(&rfcomm_ioctl_mutex); ret = __rfcomm_create_dev(sk, arg); mutex_unlock(&rfcomm_ioctl_mutex); return ret; } static int rfcomm_release_dev(void __user *arg) { int ret; mutex_lock(&rfcomm_ioctl_mutex); ret = __rfcomm_release_dev(arg); mutex_unlock(&rfcomm_ioctl_mutex); return ret; } static int rfcomm_get_dev_list(void __user *arg) { struct rfcomm_dev *dev; struct rfcomm_dev_list_req *dl; struct rfcomm_dev_info *di; int n = 0, size, err; u16 dev_num; BT_DBG(""); if (get_user(dev_num, (u16 __user *) arg)) return -EFAULT; if (!dev_num || dev_num > (PAGE_SIZE * 4) / sizeof(*di)) return -EINVAL; size = sizeof(*dl) + dev_num * sizeof(*di); dl = kzalloc(size, GFP_KERNEL); if (!dl) return -ENOMEM; di = dl->dev_info; mutex_lock(&rfcomm_dev_lock); list_for_each_entry(dev, &rfcomm_dev_list, list) { if (!tty_port_get(&dev->port)) continue; (di + n)->id = dev->id; (di + n)->flags = dev->flags; (di + n)->state = dev->dlc->state; (di + n)->channel = dev->channel; bacpy(&(di + n)->src, &dev->src); bacpy(&(di + n)->dst, &dev->dst); tty_port_put(&dev->port); if (++n >= dev_num) break; } mutex_unlock(&rfcomm_dev_lock); dl->dev_num = n; size = sizeof(*dl) + n * sizeof(*di); err = copy_to_user(arg, dl, size); kfree(dl); return err ? -EFAULT : 0; } static int rfcomm_get_dev_info(void __user *arg) { struct rfcomm_dev *dev; struct rfcomm_dev_info di; int err = 0; BT_DBG(""); if (copy_from_user(&di, arg, sizeof(di))) return -EFAULT; dev = rfcomm_dev_get(di.id); if (!dev) return -ENODEV; di.flags = dev->flags; di.channel = dev->channel; di.state = dev->dlc->state; bacpy(&di.src, &dev->src); bacpy(&di.dst, &dev->dst); if (copy_to_user(arg, &di, sizeof(di))) err = -EFAULT; tty_port_put(&dev->port); return err; } int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { BT_DBG("cmd %d arg %p", cmd, arg); switch (cmd) { case RFCOMMCREATEDEV: return rfcomm_create_dev(sk, arg); case RFCOMMRELEASEDEV: return rfcomm_release_dev(arg); case RFCOMMGETDEVLIST: return rfcomm_get_dev_list(arg); case RFCOMMGETDEVINFO: return rfcomm_get_dev_info(arg); } return -EINVAL; } /* ---- DLC callbacks ---- */ static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb) { struct rfcomm_dev *dev = dlc->owner; if (!dev) { kfree_skb(skb); return; } if (!skb_queue_empty(&dev->pending)) { skb_queue_tail(&dev->pending, skb); return; } BT_DBG("dlc %p len %d", dlc, skb->len); tty_insert_flip_string(&dev->port, skb->data, skb->len); tty_flip_buffer_push(&dev->port); kfree_skb(skb); } static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err) { struct rfcomm_dev *dev = dlc->owner; if (!dev) return; BT_DBG("dlc %p dev %p err %d", dlc, dev, err); dev->err = err; if (dlc->state == BT_CONNECTED) { rfcomm_reparent_device(dev); wake_up_interruptible(&dev->port.open_wait); } else if (dlc->state == BT_CLOSED) tty_port_tty_hangup(&dev->port, false); } static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) { struct rfcomm_dev *dev = dlc->owner; if (!dev) return; BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig); if ((dev->modem_status & TIOCM_CD) && !(v24_sig & RFCOMM_V24_DV)) tty_port_tty_hangup(&dev->port, true); dev->modem_status = ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | ((v24_sig & RFCOMM_V24_IC) ? TIOCM_RI : 0) | ((v24_sig & RFCOMM_V24_DV) ? TIOCM_CD : 0); } /* ---- TTY functions ---- */ static void rfcomm_tty_copy_pending(struct rfcomm_dev *dev) { struct sk_buff *skb; int inserted = 0; BT_DBG("dev %p", dev); rfcomm_dlc_lock(dev->dlc); while ((skb = skb_dequeue(&dev->pending))) { inserted += tty_insert_flip_string(&dev->port, skb->data, skb->len); kfree_skb(skb); } rfcomm_dlc_unlock(dev->dlc); if (inserted > 0) tty_flip_buffer_push(&dev->port); } /* do the reverse of install, clearing the tty fields and releasing the * reference to tty_port */ static void rfcomm_tty_cleanup(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags); rfcomm_dlc_lock(dev->dlc); tty->driver_data = NULL; rfcomm_dlc_unlock(dev->dlc); /* * purge the dlc->tx_queue to avoid circular dependencies * between dev and dlc */ skb_queue_purge(&dev->dlc->tx_queue); tty_port_put(&dev->port); } /* we acquire the tty_port reference since it's here the tty is first used * by setting the termios. We also populate the driver_data field and install * the tty port */ static int rfcomm_tty_install(struct tty_driver *driver, struct tty_struct *tty) { struct rfcomm_dev *dev; struct rfcomm_dlc *dlc; int err; dev = rfcomm_dev_get(tty->index); if (!dev) return -ENODEV; dlc = dev->dlc; /* Attach TTY and open DLC */ rfcomm_dlc_lock(dlc); tty->driver_data = dev; rfcomm_dlc_unlock(dlc); set_bit(RFCOMM_TTY_ATTACHED, &dev->flags); /* install the tty_port */ err = tty_port_install(&dev->port, driver, tty); if (err) { rfcomm_tty_cleanup(tty); return err; } /* take over the tty_port reference if the port was created with the * flag RFCOMM_RELEASE_ONHUP. This will force the release of the port * when the last process closes the tty. The behaviour is expected by * userspace. */ if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) { set_bit(RFCOMM_TTY_OWNED, &dev->status); tty_port_put(&dev->port); } return 0; } static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) { struct rfcomm_dev *dev = tty->driver_data; int err; BT_DBG("tty %p id %d", tty, tty->index); BT_DBG("dev %p dst %pMR channel %d opened %d", dev, &dev->dst, dev->channel, dev->port.count); err = tty_port_open(&dev->port, tty, filp); if (err) return err; /* * FIXME: rfcomm should use proper flow control for * received data. This hack will be unnecessary and can * be removed when that's implemented */ rfcomm_tty_copy_pending(dev); rfcomm_dlc_unthrottle(dev->dlc); return 0; } static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc, dev->port.count); tty_port_close(&dev->port, tty, filp); } static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; struct rfcomm_dlc *dlc = dev->dlc; struct sk_buff *skb; int sent = 0, size; BT_DBG("tty %p count %d", tty, count); while (count) { size = min_t(uint, count, dlc->mtu); skb = rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC); if (!skb) break; skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); skb_put_data(skb, buf + sent, size); rfcomm_dlc_send_noerror(dlc, skb); sent += size; count -= size; } return sent; } static int rfcomm_tty_write_room(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; int room = 0; if (dev && dev->dlc) room = rfcomm_room(dev); BT_DBG("tty %p room %d", tty, room); return room; } static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { BT_DBG("tty %p cmd 0x%02x", tty, cmd); switch (cmd) { case TCGETS: BT_DBG("TCGETS is not supported"); return -ENOIOCTLCMD; case TCSETS: BT_DBG("TCSETS is not supported"); return -ENOIOCTLCMD; case TIOCMIWAIT: BT_DBG("TIOCMIWAIT"); break; case TIOCGSERIAL: BT_ERR("TIOCGSERIAL is not supported"); return -ENOIOCTLCMD; case TIOCSSERIAL: BT_ERR("TIOCSSERIAL is not supported"); return -ENOIOCTLCMD; case TIOCSERGSTRUCT: BT_ERR("TIOCSERGSTRUCT is not supported"); return -ENOIOCTLCMD; case TIOCSERGETLSR: BT_ERR("TIOCSERGETLSR is not supported"); return -ENOIOCTLCMD; case TIOCSERCONFIG: BT_ERR("TIOCSERCONFIG is not supported"); return -ENOIOCTLCMD; default: return -ENOIOCTLCMD; /* ioctls which we must ignore */ } return -ENOIOCTLCMD; } static void rfcomm_tty_set_termios(struct tty_struct *tty, struct ktermios *old) { struct ktermios *new = &tty->termios; int old_baud_rate = tty_termios_baud_rate(old); int new_baud_rate = tty_termios_baud_rate(new); u8 baud, data_bits, stop_bits, parity, x_on, x_off; u16 changes = 0; struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p termios %p", tty, old); if (!dev || !dev->dlc || !dev->dlc->session) return; /* Handle turning off CRTSCTS */ if ((old->c_cflag & CRTSCTS) && !(new->c_cflag & CRTSCTS)) BT_DBG("Turning off CRTSCTS unsupported"); /* Parity on/off and when on, odd/even */ if (((old->c_cflag & PARENB) != (new->c_cflag & PARENB)) || ((old->c_cflag & PARODD) != (new->c_cflag & PARODD))) { changes |= RFCOMM_RPN_PM_PARITY; BT_DBG("Parity change detected."); } /* Mark and space parity are not supported! */ if (new->c_cflag & PARENB) { if (new->c_cflag & PARODD) { BT_DBG("Parity is ODD"); parity = RFCOMM_RPN_PARITY_ODD; } else { BT_DBG("Parity is EVEN"); parity = RFCOMM_RPN_PARITY_EVEN; } } else { BT_DBG("Parity is OFF"); parity = RFCOMM_RPN_PARITY_NONE; } /* Setting the x_on / x_off characters */ if (old->c_cc[VSTOP] != new->c_cc[VSTOP]) { BT_DBG("XOFF custom"); x_on = new->c_cc[VSTOP]; changes |= RFCOMM_RPN_PM_XON; } else { BT_DBG("XOFF default"); x_on = RFCOMM_RPN_XON_CHAR; } if (old->c_cc[VSTART] != new->c_cc[VSTART]) { BT_DBG("XON custom"); x_off = new->c_cc[VSTART]; changes |= RFCOMM_RPN_PM_XOFF; } else { BT_DBG("XON default"); x_off = RFCOMM_RPN_XOFF_CHAR; } /* Handle setting of stop bits */ if ((old->c_cflag & CSTOPB) != (new->c_cflag & CSTOPB)) changes |= RFCOMM_RPN_PM_STOP; /* POSIX does not support 1.5 stop bits and RFCOMM does not * support 2 stop bits. So a request for 2 stop bits gets * translated to 1.5 stop bits */ if (new->c_cflag & CSTOPB) stop_bits = RFCOMM_RPN_STOP_15; else stop_bits = RFCOMM_RPN_STOP_1; /* Handle number of data bits [5-8] */ if ((old->c_cflag & CSIZE) != (new->c_cflag & CSIZE)) changes |= RFCOMM_RPN_PM_DATA; switch (new->c_cflag & CSIZE) { case CS5: data_bits = RFCOMM_RPN_DATA_5; break; case CS6: data_bits = RFCOMM_RPN_DATA_6; break; case CS7: data_bits = RFCOMM_RPN_DATA_7; break; case CS8: data_bits = RFCOMM_RPN_DATA_8; break; default: data_bits = RFCOMM_RPN_DATA_8; break; } /* Handle baudrate settings */ if (old_baud_rate != new_baud_rate) changes |= RFCOMM_RPN_PM_BITRATE; switch (new_baud_rate) { case 2400: baud = RFCOMM_RPN_BR_2400; break; case 4800: baud = RFCOMM_RPN_BR_4800; break; case 7200: baud = RFCOMM_RPN_BR_7200; break; case 9600: baud = RFCOMM_RPN_BR_9600; break; case 19200: baud = RFCOMM_RPN_BR_19200; break; case 38400: baud = RFCOMM_RPN_BR_38400; break; case 57600: baud = RFCOMM_RPN_BR_57600; break; case 115200: baud = RFCOMM_RPN_BR_115200; break; case 230400: baud = RFCOMM_RPN_BR_230400; break; default: /* 9600 is standard accordinag to the RFCOMM specification */ baud = RFCOMM_RPN_BR_9600; break; } if (changes) rfcomm_send_rpn(dev->dlc->session, 1, dev->dlc->dlci, baud, data_bits, stop_bits, parity, RFCOMM_RPN_FLOW_NONE, x_on, x_off, changes); } static void rfcomm_tty_throttle(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); rfcomm_dlc_throttle(dev->dlc); } static void rfcomm_tty_unthrottle(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); rfcomm_dlc_unthrottle(dev->dlc); } static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); if (!dev || !dev->dlc) return 0; if (!skb_queue_empty(&dev->dlc->tx_queue)) return dev->dlc->mtu; return 0; } static void rfcomm_tty_flush_buffer(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); if (!dev || !dev->dlc) return; skb_queue_purge(&dev->dlc->tx_queue); tty_wakeup(tty); } static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch) { BT_DBG("tty %p ch %c", tty, ch); } static void rfcomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) { BT_DBG("tty %p timeout %d", tty, timeout); } static void rfcomm_tty_hangup(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); tty_port_hangup(&dev->port); } static int rfcomm_tty_tiocmget(struct tty_struct *tty) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; BT_DBG("tty %p dev %p", tty, dev); return dev->modem_status; } static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; struct rfcomm_dlc *dlc = dev->dlc; u8 v24_sig; BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear); rfcomm_dlc_get_modem_status(dlc, &v24_sig); if (set & TIOCM_DSR || set & TIOCM_DTR) v24_sig |= RFCOMM_V24_RTC; if (set & TIOCM_RTS || set & TIOCM_CTS) v24_sig |= RFCOMM_V24_RTR; if (set & TIOCM_RI) v24_sig |= RFCOMM_V24_IC; if (set & TIOCM_CD) v24_sig |= RFCOMM_V24_DV; if (clear & TIOCM_DSR || clear & TIOCM_DTR) v24_sig &= ~RFCOMM_V24_RTC; if (clear & TIOCM_RTS || clear & TIOCM_CTS) v24_sig &= ~RFCOMM_V24_RTR; if (clear & TIOCM_RI) v24_sig &= ~RFCOMM_V24_IC; if (clear & TIOCM_CD) v24_sig &= ~RFCOMM_V24_DV; rfcomm_dlc_set_modem_status(dlc, v24_sig); return 0; } /* ---- TTY structure ---- */ static const struct tty_operations rfcomm_ops = { .open = rfcomm_tty_open, .close = rfcomm_tty_close, .write = rfcomm_tty_write, .write_room = rfcomm_tty_write_room, .chars_in_buffer = rfcomm_tty_chars_in_buffer, .flush_buffer = rfcomm_tty_flush_buffer, .ioctl = rfcomm_tty_ioctl, .throttle = rfcomm_tty_throttle, .unthrottle = rfcomm_tty_unthrottle, .set_termios = rfcomm_tty_set_termios, .send_xchar = rfcomm_tty_send_xchar, .hangup = rfcomm_tty_hangup, .wait_until_sent = rfcomm_tty_wait_until_sent, .tiocmget = rfcomm_tty_tiocmget, .tiocmset = rfcomm_tty_tiocmset, .install = rfcomm_tty_install, .cleanup = rfcomm_tty_cleanup, }; int __init rfcomm_init_ttys(void) { int error; rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS); if (!rfcomm_tty_driver) return -ENOMEM; rfcomm_tty_driver->driver_name = "rfcomm"; rfcomm_tty_driver->name = "rfcomm"; rfcomm_tty_driver->major = RFCOMM_TTY_MAJOR; rfcomm_tty_driver->minor_start = RFCOMM_TTY_MINOR; rfcomm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; rfcomm_tty_driver->subtype = SERIAL_TYPE_NORMAL; rfcomm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV; rfcomm_tty_driver->init_termios = tty_std_termios; rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL; rfcomm_tty_driver->init_termios.c_lflag &= ~ICANON; tty_set_operations(rfcomm_tty_driver, &rfcomm_ops); error = tty_register_driver(rfcomm_tty_driver); if (error) { BT_ERR("Can't register RFCOMM TTY driver"); put_tty_driver(rfcomm_tty_driver); return error; } BT_INFO("RFCOMM TTY layer initialized"); return 0; } void rfcomm_cleanup_ttys(void) { tty_unregister_driver(rfcomm_tty_driver); put_tty_driver(rfcomm_tty_driver); }
2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 /* * inode.c -- user mode filesystem api for usb gadget controllers * * Copyright (C) 2003-2004 David Brownell * Copyright (C) 2003 Agilent Technologies * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ /* #define VERBOSE_DEBUG */ #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/uts.h> #include <linux/wait.h> #include <linux/compiler.h> #include <linux/uaccess.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/mmu_context.h> #include <linux/aio.h> #include <linux/uio.h> #include <linux/refcount.h> #include <linux/delay.h> #include <linux/device.h> #include <linux/moduleparam.h> #include <linux/usb/gadgetfs.h> #include <linux/usb/gadget.h> /* * The gadgetfs API maps each endpoint to a file descriptor so that you * can use standard synchronous read/write calls for I/O. There's some * O_NONBLOCK and O_ASYNC/FASYNC style i/o support. Example usermode * drivers show how this works in practice. You can also use AIO to * eliminate I/O gaps between requests, to help when streaming data. * * Key parts that must be USB-specific are protocols defining how the * read/write operations relate to the hardware state machines. There * are two types of files. One type is for the device, implementing ep0. * The other type is for each IN or OUT endpoint. In both cases, the * user mode driver must configure the hardware before using it. * * - First, dev_config() is called when /dev/gadget/$CHIP is configured * (by writing configuration and device descriptors). Afterwards it * may serve as a source of device events, used to handle all control * requests other than basic enumeration. * * - Then, after a SET_CONFIGURATION control request, ep_config() is * called when each /dev/gadget/ep* file is configured (by writing * endpoint descriptors). Afterwards these files are used to write() * IN data or to read() OUT data. To halt the endpoint, a "wrong * direction" request is issued (like reading an IN endpoint). * * Unlike "usbfs" the only ioctl()s are for things that are rare, and maybe * not possible on all hardware. For example, precise fault handling with * respect to data left in endpoint fifos after aborted operations; or * selective clearing of endpoint halts, to implement SET_INTERFACE. */ #define DRIVER_DESC "USB Gadget filesystem" #define DRIVER_VERSION "24 Aug 2004" static const char driver_desc [] = DRIVER_DESC; static const char shortname [] = "gadgetfs"; MODULE_DESCRIPTION (DRIVER_DESC); MODULE_AUTHOR ("David Brownell"); MODULE_LICENSE ("GPL"); static int ep_open(struct inode *, struct file *); /*----------------------------------------------------------------------*/ #define GADGETFS_MAGIC 0xaee71ee7 /* /dev/gadget/$CHIP represents ep0 and the whole device */ enum ep0_state { /* DISABLED is the initial state. */ STATE_DEV_DISABLED = 0, /* Only one open() of /dev/gadget/$CHIP; only one file tracks * ep0/device i/o modes and binding to the controller. Driver * must always write descriptors to initialize the device, then * the device becomes UNCONNECTED until enumeration. */ STATE_DEV_OPENED, /* From then on, ep0 fd is in either of two basic modes: * - (UN)CONNECTED: read usb_gadgetfs_event(s) from it * - SETUP: read/write will transfer control data and succeed; * or if "wrong direction", performs protocol stall */ STATE_DEV_UNCONNECTED, STATE_DEV_CONNECTED, STATE_DEV_SETUP, /* UNBOUND means the driver closed ep0, so the device won't be * accessible again (DEV_DISABLED) until all fds are closed. */ STATE_DEV_UNBOUND, }; /* enough for the whole queue: most events invalidate others */ #define N_EVENT 5 #define RBUF_SIZE 256 struct dev_data { spinlock_t lock; refcount_t count; int udc_usage; enum ep0_state state; /* P: lock */ struct usb_gadgetfs_event event [N_EVENT]; unsigned ev_next; struct fasync_struct *fasync; u8 current_config; /* drivers reading ep0 MUST handle control requests (SETUP) * reported that way; else the host will time out. */ unsigned usermode_setup : 1, setup_in : 1, setup_can_stall : 1, setup_out_ready : 1, setup_out_error : 1, setup_abort : 1, gadget_registered : 1; unsigned setup_wLength; /* the rest is basically write-once */ struct usb_config_descriptor *config, *hs_config; struct usb_device_descriptor *dev; struct usb_request *req; struct usb_gadget *gadget; struct list_head epfiles; void *buf; wait_queue_head_t wait; struct super_block *sb; struct dentry *dentry; /* except this scratch i/o buffer for ep0 */ u8 rbuf[RBUF_SIZE]; }; static inline void get_dev (struct dev_data *data) { refcount_inc (&data->count); } static void put_dev (struct dev_data *data) { if (likely (!refcount_dec_and_test (&data->count))) return; /* needs no more cleanup */ BUG_ON (waitqueue_active (&data->wait)); kfree (data); } static struct dev_data *dev_new (void) { struct dev_data *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; dev->state = STATE_DEV_DISABLED; refcount_set (&dev->count, 1); spin_lock_init (&dev->lock); INIT_LIST_HEAD (&dev->epfiles); init_waitqueue_head (&dev->wait); return dev; } /*----------------------------------------------------------------------*/ /* other /dev/gadget/$ENDPOINT files represent endpoints */ enum ep_state { STATE_EP_DISABLED = 0, STATE_EP_READY, STATE_EP_ENABLED, STATE_EP_UNBOUND, }; struct ep_data { struct mutex lock; enum ep_state state; refcount_t count; struct dev_data *dev; /* must hold dev->lock before accessing ep or req */ struct usb_ep *ep; struct usb_request *req; ssize_t status; char name [16]; struct usb_endpoint_descriptor desc, hs_desc; struct list_head epfiles; wait_queue_head_t wait; struct dentry *dentry; }; static inline void get_ep (struct ep_data *data) { refcount_inc (&data->count); } static void put_ep (struct ep_data *data) { if (likely (!refcount_dec_and_test (&data->count))) return; put_dev (data->dev); /* needs no more cleanup */ BUG_ON (!list_empty (&data->epfiles)); BUG_ON (waitqueue_active (&data->wait)); kfree (data); } /*----------------------------------------------------------------------*/ /* most "how to use the hardware" policy choices are in userspace: * mapping endpoint roles (which the driver needs) to the capabilities * which the usb controller has. most of those capabilities are exposed * implicitly, starting with the driver name and then endpoint names. */ static const char *CHIP; /*----------------------------------------------------------------------*/ /* NOTE: don't use dev_printk calls before binding to the gadget * at the end of ep0 configuration, or after unbind. */ /* too wordy: dev_printk(level , &(d)->gadget->dev , fmt , ## args) */ #define xprintk(d,level,fmt,args...) \ printk(level "%s: " fmt , shortname , ## args) #ifdef DEBUG #define DBG(dev,fmt,args...) \ xprintk(dev , KERN_DEBUG , fmt , ## args) #else #define DBG(dev,fmt,args...) \ do { } while (0) #endif /* DEBUG */ #ifdef VERBOSE_DEBUG #define VDEBUG DBG #else #define VDEBUG(dev,fmt,args...) \ do { } while (0) #endif /* DEBUG */ #define ERROR(dev,fmt,args...) \ xprintk(dev , KERN_ERR , fmt , ## args) #define INFO(dev,fmt,args...) \ xprintk(dev , KERN_INFO , fmt , ## args) /*----------------------------------------------------------------------*/ /* SYNCHRONOUS ENDPOINT OPERATIONS (bulk/intr/iso) * * After opening, configure non-control endpoints. Then use normal * stream read() and write() requests; and maybe ioctl() to get more * precise FIFO status when recovering from cancellation. */ static void epio_complete (struct usb_ep *ep, struct usb_request *req) { struct ep_data *epdata = ep->driver_data; if (!req->context) return; if (req->status) epdata->status = req->status; else epdata->status = req->actual; complete ((struct completion *)req->context); } /* tasklock endpoint, returning when it's connected. * still need dev->lock to use epdata->ep. */ static int get_ready_ep (unsigned f_flags, struct ep_data *epdata, bool is_write) { int val; if (f_flags & O_NONBLOCK) { if (!mutex_trylock(&epdata->lock)) goto nonblock; if (epdata->state != STATE_EP_ENABLED && (!is_write || epdata->state != STATE_EP_READY)) { mutex_unlock(&epdata->lock); nonblock: val = -EAGAIN; } else val = 0; return val; } val = mutex_lock_interruptible(&epdata->lock); if (val < 0) return val; switch (epdata->state) { case STATE_EP_ENABLED: return 0; case STATE_EP_READY: /* not configured yet */ if (is_write) return 0; // FALLTHRU case STATE_EP_UNBOUND: /* clean disconnect */ break; // case STATE_EP_DISABLED: /* "can't happen" */ default: /* error! */ pr_debug ("%s: ep %p not available, state %d\n", shortname, epdata, epdata->state); } mutex_unlock(&epdata->lock); return -ENODEV; } static ssize_t ep_io (struct ep_data *epdata, void *buf, unsigned len) { DECLARE_COMPLETION_ONSTACK (done); int value; spin_lock_irq (&epdata->dev->lock); if (likely (epdata->ep != NULL)) { struct usb_request *req = epdata->req; req->context = &done; req->complete = epio_complete; req->buf = buf; req->length = len; value = usb_ep_queue (epdata->ep, req, GFP_ATOMIC); } else value = -ENODEV; spin_unlock_irq (&epdata->dev->lock); if (likely (value == 0)) { value = wait_event_interruptible (done.wait, done.done); if (value != 0) { spin_lock_irq (&epdata->dev->lock); if (likely (epdata->ep != NULL)) { DBG (epdata->dev, "%s i/o interrupted\n", epdata->name); usb_ep_dequeue (epdata->ep, epdata->req); spin_unlock_irq (&epdata->dev->lock); wait_event (done.wait, done.done); if (epdata->status == -ECONNRESET) epdata->status = -EINTR; } else { spin_unlock_irq (&epdata->dev->lock); DBG (epdata->dev, "endpoint gone\n"); wait_for_completion(&done); epdata->status = -ENODEV; } } return epdata->status; } return value; } static int ep_release (struct inode *inode, struct file *fd) { struct ep_data *data = fd->private_data; int value; value = mutex_lock_interruptible(&data->lock); if (value < 0) return value; /* clean up if this can be reopened */ if (data->state != STATE_EP_UNBOUND) { data->state = STATE_EP_DISABLED; data->desc.bDescriptorType = 0; data->hs_desc.bDescriptorType = 0; usb_ep_disable(data->ep); } mutex_unlock(&data->lock); put_ep (data); return 0; } static long ep_ioctl(struct file *fd, unsigned code, unsigned long value) { struct ep_data *data = fd->private_data; int status; if ((status = get_ready_ep (fd->f_flags, data, false)) < 0) return status; spin_lock_irq (&data->dev->lock); if (likely (data->ep != NULL)) { switch (code) { case GADGETFS_FIFO_STATUS: status = usb_ep_fifo_status (data->ep); break; case GADGETFS_FIFO_FLUSH: usb_ep_fifo_flush (data->ep); break; case GADGETFS_CLEAR_HALT: status = usb_ep_clear_halt (data->ep); break; default: status = -ENOTTY; } } else status = -ENODEV; spin_unlock_irq (&data->dev->lock); mutex_unlock(&data->lock); return status; } /*----------------------------------------------------------------------*/ /* ASYNCHRONOUS ENDPOINT I/O OPERATIONS (bulk/intr/iso) */ struct kiocb_priv { struct usb_request *req; struct ep_data *epdata; struct kiocb *iocb; struct mm_struct *mm; struct work_struct work; void *buf; struct iov_iter to; const void *to_free; unsigned actual; }; static int ep_aio_cancel(struct kiocb *iocb) { struct kiocb_priv *priv = iocb->private; struct ep_data *epdata; int value; local_irq_disable(); epdata = priv->epdata; // spin_lock(&epdata->dev->lock); if (likely(epdata && epdata->ep && priv->req)) value = usb_ep_dequeue (epdata->ep, priv->req); else value = -EINVAL; // spin_unlock(&epdata->dev->lock); local_irq_enable(); return value; } static void ep_user_copy_worker(struct work_struct *work) { struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work); struct mm_struct *mm = priv->mm; struct kiocb *iocb = priv->iocb; size_t ret; use_mm(mm); ret = copy_to_iter(priv->buf, priv->actual, &priv->to); unuse_mm(mm); if (!ret) ret = -EFAULT; /* completing the iocb can drop the ctx and mm, don't touch mm after */ iocb->ki_complete(iocb, ret, ret); kfree(priv->buf); kfree(priv->to_free); kfree(priv); } static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) { struct kiocb *iocb = req->context; struct kiocb_priv *priv = iocb->private; struct ep_data *epdata = priv->epdata; /* lock against disconnect (and ideally, cancel) */ spin_lock(&epdata->dev->lock); priv->req = NULL; priv->epdata = NULL; /* if this was a write or a read returning no data then we * don't need to copy anything to userspace, so we can * complete the aio request immediately. */ if (priv->to_free == NULL || unlikely(req->actual == 0)) { kfree(req->buf); kfree(priv->to_free); kfree(priv); iocb->private = NULL; /* aio_complete() reports bytes-transferred _and_ faults */ iocb->ki_complete(iocb, req->actual ? req->actual : req->status, req->status); } else { /* ep_copy_to_user() won't report both; we hide some faults */ if (unlikely(0 != req->status)) DBG(epdata->dev, "%s fault %d len %d\n", ep->name, req->status, req->actual); priv->buf = req->buf; priv->actual = req->actual; INIT_WORK(&priv->work, ep_user_copy_worker); schedule_work(&priv->work); } usb_ep_free_request(ep, req); spin_unlock(&epdata->dev->lock); put_ep(epdata); } static ssize_t ep_aio(struct kiocb *iocb, struct kiocb_priv *priv, struct ep_data *epdata, char *buf, size_t len) { struct usb_request *req; ssize_t value; iocb->private = priv; priv->iocb = iocb; kiocb_set_cancel_fn(iocb, ep_aio_cancel); get_ep(epdata); priv->epdata = epdata; priv->actual = 0; priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */ /* each kiocb is coupled to one usb_request, but we can't * allocate or submit those if the host disconnected. */ spin_lock_irq(&epdata->dev->lock); value = -ENODEV; if (unlikely(epdata->ep == NULL)) goto fail; req = usb_ep_alloc_request(epdata->ep, GFP_ATOMIC); value = -ENOMEM; if (unlikely(!req)) goto fail; priv->req = req; req->buf = buf; req->length = len; req->complete = ep_aio_complete; req->context = iocb; value = usb_ep_queue(epdata->ep, req, GFP_ATOMIC); if (unlikely(0 != value)) { usb_ep_free_request(epdata->ep, req); goto fail; } spin_unlock_irq(&epdata->dev->lock); return -EIOCBQUEUED; fail: spin_unlock_irq(&epdata->dev->lock); kfree(priv->to_free); kfree(priv); put_ep(epdata); return value; } static ssize_t ep_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct ep_data *epdata = file->private_data; size_t len = iov_iter_count(to); ssize_t value; char *buf; if ((value = get_ready_ep(file->f_flags, epdata, false)) < 0) return value; /* halt any endpoint by doing a "wrong direction" i/o call */ if (usb_endpoint_dir_in(&epdata->desc)) { if (usb_endpoint_xfer_isoc(&epdata->desc) || !is_sync_kiocb(iocb)) { mutex_unlock(&epdata->lock); return -EINVAL; } DBG (epdata->dev, "%s halt\n", epdata->name); spin_lock_irq(&epdata->dev->lock); if (likely(epdata->ep != NULL)) usb_ep_set_halt(epdata->ep); spin_unlock_irq(&epdata->dev->lock); mutex_unlock(&epdata->lock); return -EBADMSG; } buf = kmalloc(len, GFP_KERNEL); if (unlikely(!buf)) { mutex_unlock(&epdata->lock); return -ENOMEM; } if (is_sync_kiocb(iocb)) { value = ep_io(epdata, buf, len); if (value >= 0 && (copy_to_iter(buf, value, to) != value)) value = -EFAULT; } else { struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL); value = -ENOMEM; if (!priv) goto fail; priv->to_free = dup_iter(&priv->to, to, GFP_KERNEL); if (!priv->to_free) { kfree(priv); goto fail; } value = ep_aio(iocb, priv, epdata, buf, len); if (value == -EIOCBQUEUED) buf = NULL; } fail: kfree(buf); mutex_unlock(&epdata->lock); return value; } static ssize_t ep_config(struct ep_data *, const char *, size_t); static ssize_t ep_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct ep_data *epdata = file->private_data; size_t len = iov_iter_count(from); bool configured; ssize_t value; char *buf; if ((value = get_ready_ep(file->f_flags, epdata, true)) < 0) return value; configured = epdata->state == STATE_EP_ENABLED; /* halt any endpoint by doing a "wrong direction" i/o call */ if (configured && !usb_endpoint_dir_in(&epdata->desc)) { if (usb_endpoint_xfer_isoc(&epdata->desc) || !is_sync_kiocb(iocb)) { mutex_unlock(&epdata->lock); return -EINVAL; } DBG (epdata->dev, "%s halt\n", epdata->name); spin_lock_irq(&epdata->dev->lock); if (likely(epdata->ep != NULL)) usb_ep_set_halt(epdata->ep); spin_unlock_irq(&epdata->dev->lock); mutex_unlock(&epdata->lock); return -EBADMSG; } buf = kmalloc(len, GFP_KERNEL); if (unlikely(!buf)) { mutex_unlock(&epdata->lock); return -ENOMEM; } if (unlikely(!copy_from_iter_full(buf, len, from))) { value = -EFAULT; goto out; } if (unlikely(!configured)) { value = ep_config(epdata, buf, len); } else if (is_sync_kiocb(iocb)) { value = ep_io(epdata, buf, len); } else { struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL); value = -ENOMEM; if (priv) { value = ep_aio(iocb, priv, epdata, buf, len); if (value == -EIOCBQUEUED) buf = NULL; } } out: kfree(buf); mutex_unlock(&epdata->lock); return value; } /*----------------------------------------------------------------------*/ /* used after endpoint configuration */ static const struct file_operations ep_io_operations = { .owner = THIS_MODULE, .open = ep_open, .release = ep_release, .llseek = no_llseek, .unlocked_ioctl = ep_ioctl, .read_iter = ep_read_iter, .write_iter = ep_write_iter, }; /* ENDPOINT INITIALIZATION * * fd = open ("/dev/gadget/$ENDPOINT", O_RDWR) * status = write (fd, descriptors, sizeof descriptors) * * That write establishes the endpoint configuration, configuring * the controller to process bulk, interrupt, or isochronous transfers * at the right maxpacket size, and so on. * * The descriptors are message type 1, identified by a host order u32 * at the beginning of what's written. Descriptor order is: full/low * speed descriptor, then optional high speed descriptor. */ static ssize_t ep_config (struct ep_data *data, const char *buf, size_t len) { struct usb_ep *ep; u32 tag; int value, length = len; if (data->state != STATE_EP_READY) { value = -EL2HLT; goto fail; } value = len; if (len < USB_DT_ENDPOINT_SIZE + 4) goto fail0; /* we might need to change message format someday */ memcpy(&tag, buf, 4); if (tag != 1) { DBG(data->dev, "config %s, bad tag %d\n", data->name, tag); goto fail0; } buf += 4; len -= 4; /* NOTE: audio endpoint extensions not accepted here; * just don't include the extra bytes. */ /* full/low speed descriptor, then high speed */ memcpy(&data->desc, buf, USB_DT_ENDPOINT_SIZE); if (data->desc.bLength != USB_DT_ENDPOINT_SIZE || data->desc.bDescriptorType != USB_DT_ENDPOINT) goto fail0; if (len != USB_DT_ENDPOINT_SIZE) { if (len != 2 * USB_DT_ENDPOINT_SIZE) goto fail0; memcpy(&data->hs_desc, buf + USB_DT_ENDPOINT_SIZE, USB_DT_ENDPOINT_SIZE); if (data->hs_desc.bLength != USB_DT_ENDPOINT_SIZE || data->hs_desc.bDescriptorType != USB_DT_ENDPOINT) { DBG(data->dev, "config %s, bad hs length or type\n", data->name); goto fail0; } } spin_lock_irq (&data->dev->lock); if (data->dev->state == STATE_DEV_UNBOUND) { value = -ENOENT; goto gone; } else { ep = data->ep; if (ep == NULL) { value = -ENODEV; goto gone; } } switch (data->dev->gadget->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: ep->desc = &data->desc; break; case USB_SPEED_HIGH: /* fails if caller didn't provide that descriptor... */ ep->desc = &data->hs_desc; break; default: DBG(data->dev, "unconnected, %s init abandoned\n", data->name); value = -EINVAL; goto gone; } value = usb_ep_enable(ep); if (value == 0) { data->state = STATE_EP_ENABLED; value = length; } gone: spin_unlock_irq (&data->dev->lock); if (value < 0) { fail: data->desc.bDescriptorType = 0; data->hs_desc.bDescriptorType = 0; } return value; fail0: value = -EINVAL; goto fail; } static int ep_open (struct inode *inode, struct file *fd) { struct ep_data *data = inode->i_private; int value = -EBUSY; if (mutex_lock_interruptible(&data->lock) != 0) return -EINTR; spin_lock_irq (&data->dev->lock); if (data->dev->state == STATE_DEV_UNBOUND) value = -ENOENT; else if (data->state == STATE_EP_DISABLED) { value = 0; data->state = STATE_EP_READY; get_ep (data); fd->private_data = data; VDEBUG (data->dev, "%s ready\n", data->name); } else DBG (data->dev, "%s state %d\n", data->name, data->state); spin_unlock_irq (&data->dev->lock); mutex_unlock(&data->lock); return value; } /*----------------------------------------------------------------------*/ /* EP0 IMPLEMENTATION can be partly in userspace. * * Drivers that use this facility receive various events, including * control requests the kernel doesn't handle. Drivers that don't * use this facility may be too simple-minded for real applications. */ static inline void ep0_readable (struct dev_data *dev) { wake_up (&dev->wait); kill_fasync (&dev->fasync, SIGIO, POLL_IN); } static void clean_req (struct usb_ep *ep, struct usb_request *req) { struct dev_data *dev = ep->driver_data; if (req->buf != dev->rbuf) { kfree(req->buf); req->buf = dev->rbuf; } req->complete = epio_complete; dev->setup_out_ready = 0; } static void ep0_complete (struct usb_ep *ep, struct usb_request *req) { struct dev_data *dev = ep->driver_data; unsigned long flags; int free = 1; /* for control OUT, data must still get to userspace */ spin_lock_irqsave(&dev->lock, flags); if (!dev->setup_in) { dev->setup_out_error = (req->status != 0); if (!dev->setup_out_error) free = 0; dev->setup_out_ready = 1; ep0_readable (dev); } /* clean up as appropriate */ if (free && req->buf != &dev->rbuf) clean_req (ep, req); req->complete = epio_complete; spin_unlock_irqrestore(&dev->lock, flags); } static int setup_req (struct usb_ep *ep, struct usb_request *req, u16 len) { struct dev_data *dev = ep->driver_data; if (dev->setup_out_ready) { DBG (dev, "ep0 request busy!\n"); return -EBUSY; } if (len > sizeof (dev->rbuf)) req->buf = kmalloc(len, GFP_ATOMIC); if (req->buf == NULL) { req->buf = dev->rbuf; return -ENOMEM; } req->complete = ep0_complete; req->length = len; req->zero = 0; return 0; } static ssize_t ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) { struct dev_data *dev = fd->private_data; ssize_t retval; enum ep0_state state; spin_lock_irq (&dev->lock); if (dev->state <= STATE_DEV_OPENED) { retval = -EINVAL; goto done; } /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; retval = -EIDRM; goto done; } /* control DATA stage */ if ((state = dev->state) == STATE_DEV_SETUP) { if (dev->setup_in) { /* stall IN */ VDEBUG(dev, "ep0in stall\n"); (void) usb_ep_set_halt (dev->gadget->ep0); retval = -EL2HLT; dev->state = STATE_DEV_CONNECTED; } else if (len == 0) { /* ack SET_CONFIGURATION etc */ struct usb_ep *ep = dev->gadget->ep0; struct usb_request *req = dev->req; if ((retval = setup_req (ep, req, 0)) == 0) { ++dev->udc_usage; spin_unlock_irq (&dev->lock); retval = usb_ep_queue (ep, req, GFP_KERNEL); spin_lock_irq (&dev->lock); --dev->udc_usage; } dev->state = STATE_DEV_CONNECTED; /* assume that was SET_CONFIGURATION */ if (dev->current_config) { unsigned power; if (gadget_is_dualspeed(dev->gadget) && (dev->gadget->speed == USB_SPEED_HIGH)) power = dev->hs_config->bMaxPower; else power = dev->config->bMaxPower; usb_gadget_vbus_draw(dev->gadget, 2 * power); } } else { /* collect OUT data */ if ((fd->f_flags & O_NONBLOCK) != 0 && !dev->setup_out_ready) { retval = -EAGAIN; goto done; } spin_unlock_irq (&dev->lock); retval = wait_event_interruptible (dev->wait, dev->setup_out_ready != 0); /* FIXME state could change from under us */ spin_lock_irq (&dev->lock); if (retval) goto done; if (dev->state != STATE_DEV_SETUP) { retval = -ECANCELED; goto done; } dev->state = STATE_DEV_CONNECTED; if (dev->setup_out_error) retval = -EIO; else { len = min (len, (size_t)dev->req->actual); ++dev->udc_usage; spin_unlock_irq(&dev->lock); if (copy_to_user (buf, dev->req->buf, len)) retval = -EFAULT; else retval = len; spin_lock_irq(&dev->lock); --dev->udc_usage; clean_req (dev->gadget->ep0, dev->req); /* NOTE userspace can't yet choose to stall */ } } goto done; } /* else normal: return event data */ if (len < sizeof dev->event [0]) { retval = -EINVAL; goto done; } len -= len % sizeof (struct usb_gadgetfs_event); dev->usermode_setup = 1; scan: /* return queued events right away */ if (dev->ev_next != 0) { unsigned i, n; n = len / sizeof (struct usb_gadgetfs_event); if (dev->ev_next < n) n = dev->ev_next; /* ep0 i/o has special semantics during STATE_DEV_SETUP */ for (i = 0; i < n; i++) { if (dev->event [i].type == GADGETFS_SETUP) { dev->state = STATE_DEV_SETUP; n = i + 1; break; } } spin_unlock_irq (&dev->lock); len = n * sizeof (struct usb_gadgetfs_event); if (copy_to_user (buf, &dev->event, len)) retval = -EFAULT; else retval = len; if (len > 0) { /* NOTE this doesn't guard against broken drivers; * concurrent ep0 readers may lose events. */ spin_lock_irq (&dev->lock); if (dev->ev_next > n) { memmove(&dev->event[0], &dev->event[n], sizeof (struct usb_gadgetfs_event) * (dev->ev_next - n)); } dev->ev_next -= n; spin_unlock_irq (&dev->lock); } return retval; } if (fd->f_flags & O_NONBLOCK) { retval = -EAGAIN; goto done; } switch (state) { default: DBG (dev, "fail %s, state %d\n", __func__, state); retval = -ESRCH; break; case STATE_DEV_UNCONNECTED: case STATE_DEV_CONNECTED: spin_unlock_irq (&dev->lock); DBG (dev, "%s wait\n", __func__); /* wait for events */ retval = wait_event_interruptible (dev->wait, dev->ev_next != 0); if (retval < 0) return retval; spin_lock_irq (&dev->lock); goto scan; } done: spin_unlock_irq (&dev->lock); return retval; } static struct usb_gadgetfs_event * next_event (struct dev_data *dev, enum usb_gadgetfs_event_type type) { struct usb_gadgetfs_event *event; unsigned i; switch (type) { /* these events purge the queue */ case GADGETFS_DISCONNECT: if (dev->state == STATE_DEV_SETUP) dev->setup_abort = 1; // FALL THROUGH case GADGETFS_CONNECT: dev->ev_next = 0; break; case GADGETFS_SETUP: /* previous request timed out */ case GADGETFS_SUSPEND: /* same effect */ /* these events can't be repeated */ for (i = 0; i != dev->ev_next; i++) { if (dev->event [i].type != type) continue; DBG(dev, "discard old event[%d] %d\n", i, type); dev->ev_next--; if (i == dev->ev_next) break; /* indices start at zero, for simplicity */ memmove (&dev->event [i], &dev->event [i + 1], sizeof (struct usb_gadgetfs_event) * (dev->ev_next - i)); } break; default: BUG (); } VDEBUG(dev, "event[%d] = %d\n", dev->ev_next, type); event = &dev->event [dev->ev_next++]; BUG_ON (dev->ev_next > N_EVENT); memset (event, 0, sizeof *event); event->type = type; return event; } static ssize_t ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) { struct dev_data *dev = fd->private_data; ssize_t retval = -ESRCH; /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; retval = -EIDRM; /* data and/or status stage for control request */ } else if (dev->state == STATE_DEV_SETUP) { len = min_t(size_t, len, dev->setup_wLength); if (dev->setup_in) { retval = setup_req (dev->gadget->ep0, dev->req, len); if (retval == 0) { dev->state = STATE_DEV_CONNECTED; ++dev->udc_usage; spin_unlock_irq (&dev->lock); if (copy_from_user (dev->req->buf, buf, len)) retval = -EFAULT; else { if (len < dev->setup_wLength) dev->req->zero = 1; retval = usb_ep_queue ( dev->gadget->ep0, dev->req, GFP_KERNEL); } spin_lock_irq(&dev->lock); --dev->udc_usage; if (retval < 0) { clean_req (dev->gadget->ep0, dev->req); } else retval = len; return retval; } /* can stall some OUT transfers */ } else if (dev->setup_can_stall) { VDEBUG(dev, "ep0out stall\n"); (void) usb_ep_set_halt (dev->gadget->ep0); retval = -EL2HLT; dev->state = STATE_DEV_CONNECTED; } else { DBG(dev, "bogus ep0out stall!\n"); } } else DBG (dev, "fail %s, state %d\n", __func__, dev->state); return retval; } static int ep0_fasync (int f, struct file *fd, int on) { struct dev_data *dev = fd->private_data; // caller must F_SETOWN before signal delivery happens VDEBUG (dev, "%s %s\n", __func__, on ? "on" : "off"); return fasync_helper (f, fd, on, &dev->fasync); } static struct usb_gadget_driver gadgetfs_driver; static int dev_release (struct inode *inode, struct file *fd) { struct dev_data *dev = fd->private_data; /* closing ep0 === shutdown all */ if (dev->gadget_registered) { usb_gadget_unregister_driver (&gadgetfs_driver); dev->gadget_registered = false; } /* at this point "good" hardware has disconnected the * device from USB; the host won't see it any more. * alternatively, all host requests will time out. */ kfree (dev->buf); dev->buf = NULL; /* other endpoints were all decoupled from this device */ spin_lock_irq(&dev->lock); dev->state = STATE_DEV_DISABLED; spin_unlock_irq(&dev->lock); put_dev (dev); return 0; } static unsigned int ep0_poll (struct file *fd, poll_table *wait) { struct dev_data *dev = fd->private_data; int mask = 0; if (dev->state <= STATE_DEV_OPENED) return DEFAULT_POLLMASK; poll_wait(fd, &dev->wait, wait); spin_lock_irq (&dev->lock); /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; mask = POLLHUP; goto out; } if (dev->state == STATE_DEV_SETUP) { if (dev->setup_in || dev->setup_can_stall) mask = POLLOUT; } else { if (dev->ev_next != 0) mask = POLLIN; } out: spin_unlock_irq(&dev->lock); return mask; } static long dev_ioctl (struct file *fd, unsigned code, unsigned long value) { struct dev_data *dev = fd->private_data; struct usb_gadget *gadget = dev->gadget; long ret = -ENOTTY; spin_lock_irq(&dev->lock); if (dev->state == STATE_DEV_OPENED || dev->state == STATE_DEV_UNBOUND) { /* Not bound to a UDC */ } else if (gadget->ops->ioctl) { ++dev->udc_usage; spin_unlock_irq(&dev->lock); ret = gadget->ops->ioctl (gadget, code, value); spin_lock_irq(&dev->lock); --dev->udc_usage; } spin_unlock_irq(&dev->lock); return ret; } /*----------------------------------------------------------------------*/ /* The in-kernel gadget driver handles most ep0 issues, in particular * enumerating the single configuration (as provided from user space). * * Unrecognized ep0 requests may be handled in user space. */ static void make_qualifier (struct dev_data *dev) { struct usb_qualifier_descriptor qual; struct usb_device_descriptor *desc; qual.bLength = sizeof qual; qual.bDescriptorType = USB_DT_DEVICE_QUALIFIER; qual.bcdUSB = cpu_to_le16 (0x0200); desc = dev->dev; qual.bDeviceClass = desc->bDeviceClass; qual.bDeviceSubClass = desc->bDeviceSubClass; qual.bDeviceProtocol = desc->bDeviceProtocol; /* assumes ep0 uses the same value for both speeds ... */ qual.bMaxPacketSize0 = dev->gadget->ep0->maxpacket; qual.bNumConfigurations = 1; qual.bRESERVED = 0; memcpy (dev->rbuf, &qual, sizeof qual); } static int config_buf (struct dev_data *dev, u8 type, unsigned index) { int len; int hs = 0; /* only one configuration */ if (index > 0) return -EINVAL; if (gadget_is_dualspeed(dev->gadget)) { hs = (dev->gadget->speed == USB_SPEED_HIGH); if (type == USB_DT_OTHER_SPEED_CONFIG) hs = !hs; } if (hs) { dev->req->buf = dev->hs_config; len = le16_to_cpu(dev->hs_config->wTotalLength); } else { dev->req->buf = dev->config; len = le16_to_cpu(dev->config->wTotalLength); } ((u8 *)dev->req->buf) [1] = type; return len; } static int gadgetfs_setup (struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) { struct dev_data *dev = get_gadget_data (gadget); struct usb_request *req = dev->req; int value = -EOPNOTSUPP; struct usb_gadgetfs_event *event; u16 w_value = le16_to_cpu(ctrl->wValue); u16 w_length = le16_to_cpu(ctrl->wLength); if (w_length > RBUF_SIZE) { if (ctrl->bRequestType & USB_DIR_IN) { /* Cast away the const, we are going to overwrite on purpose. */ __le16 *temp = (__le16 *)&ctrl->wLength; *temp = cpu_to_le16(RBUF_SIZE); w_length = RBUF_SIZE; } else { return value; } } spin_lock (&dev->lock); dev->setup_abort = 0; if (dev->state == STATE_DEV_UNCONNECTED) { if (gadget_is_dualspeed(gadget) && gadget->speed == USB_SPEED_HIGH && dev->hs_config == NULL) { spin_unlock(&dev->lock); ERROR (dev, "no high speed config??\n"); return -EINVAL; } dev->state = STATE_DEV_CONNECTED; INFO (dev, "connected\n"); event = next_event (dev, GADGETFS_CONNECT); event->u.speed = gadget->speed; ep0_readable (dev); /* host may have given up waiting for response. we can miss control * requests handled lower down (device/endpoint status and features); * then ep0_{read,write} will report the wrong status. controller * driver will have aborted pending i/o. */ } else if (dev->state == STATE_DEV_SETUP) dev->setup_abort = 1; req->buf = dev->rbuf; req->context = NULL; switch (ctrl->bRequest) { case USB_REQ_GET_DESCRIPTOR: if (ctrl->bRequestType != USB_DIR_IN) goto unrecognized; switch (w_value >> 8) { case USB_DT_DEVICE: value = min (w_length, (u16) sizeof *dev->dev); dev->dev->bMaxPacketSize0 = dev->gadget->ep0->maxpacket; req->buf = dev->dev; break; case USB_DT_DEVICE_QUALIFIER: if (!dev->hs_config) break; value = min (w_length, (u16) sizeof (struct usb_qualifier_descriptor)); make_qualifier (dev); break; case USB_DT_OTHER_SPEED_CONFIG: // FALLTHROUGH case USB_DT_CONFIG: value = config_buf (dev, w_value >> 8, w_value & 0xff); if (value >= 0) value = min (w_length, (u16) value); break; case USB_DT_STRING: goto unrecognized; default: // all others are errors break; } break; /* currently one config, two speeds */ case USB_REQ_SET_CONFIGURATION: if (ctrl->bRequestType != 0) goto unrecognized; if (0 == (u8) w_value) { value = 0; dev->current_config = 0; usb_gadget_vbus_draw(gadget, 8 /* mA */ ); // user mode expected to disable endpoints } else { u8 config, power; if (gadget_is_dualspeed(gadget) && gadget->speed == USB_SPEED_HIGH) { config = dev->hs_config->bConfigurationValue; power = dev->hs_config->bMaxPower; } else { config = dev->config->bConfigurationValue; power = dev->config->bMaxPower; } if (config == (u8) w_value) { value = 0; dev->current_config = config; usb_gadget_vbus_draw(gadget, 2 * power); } } /* report SET_CONFIGURATION like any other control request, * except that usermode may not stall this. the next * request mustn't be allowed start until this finishes: * endpoints and threads set up, etc. * * NOTE: older PXA hardware (before PXA 255: without UDCCFR) * has bad/racey automagic that prevents synchronizing here. * even kernel mode drivers often miss them. */ if (value == 0) { INFO (dev, "configuration #%d\n", dev->current_config); usb_gadget_set_state(gadget, USB_STATE_CONFIGURED); if (dev->usermode_setup) { dev->setup_can_stall = 0; goto delegate; } } break; #ifndef CONFIG_USB_PXA25X /* PXA automagically handles this request too */ case USB_REQ_GET_CONFIGURATION: if (ctrl->bRequestType != 0x80) goto unrecognized; *(u8 *)req->buf = dev->current_config; value = min (w_length, (u16) 1); break; #endif default: unrecognized: VDEBUG (dev, "%s req%02x.%02x v%04x i%04x l%d\n", dev->usermode_setup ? "delegate" : "fail", ctrl->bRequestType, ctrl->bRequest, w_value, le16_to_cpu(ctrl->wIndex), w_length); /* if there's an ep0 reader, don't stall */ if (dev->usermode_setup) { dev->setup_can_stall = 1; delegate: dev->setup_in = (ctrl->bRequestType & USB_DIR_IN) ? 1 : 0; dev->setup_wLength = w_length; dev->setup_out_ready = 0; dev->setup_out_error = 0; value = 0; /* read DATA stage for OUT right away */ if (unlikely (!dev->setup_in && w_length)) { value = setup_req (gadget->ep0, dev->req, w_length); if (value < 0) break; ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, dev->req, GFP_KERNEL); spin_lock (&dev->lock); --dev->udc_usage; if (value < 0) { clean_req (gadget->ep0, dev->req); break; } /* we can't currently stall these */ dev->setup_can_stall = 0; } /* state changes when reader collects event */ event = next_event (dev, GADGETFS_SETUP); event->u.setup = *ctrl; ep0_readable (dev); spin_unlock (&dev->lock); return 0; } } /* proceed with data transfer and status phases? */ if (value >= 0 && dev->state != STATE_DEV_SETUP) { req->length = value; req->zero = value < w_length; ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, req, GFP_KERNEL); spin_lock(&dev->lock); --dev->udc_usage; spin_unlock(&dev->lock); if (value < 0) { DBG (dev, "ep_queue --> %d\n", value); req->status = 0; } return value; } /* device stalls when value < 0 */ spin_unlock (&dev->lock); return value; } static void destroy_ep_files (struct dev_data *dev) { DBG (dev, "%s %d\n", __func__, dev->state); /* dev->state must prevent interference */ spin_lock_irq (&dev->lock); while (!list_empty(&dev->epfiles)) { struct ep_data *ep; struct inode *parent; struct dentry *dentry; /* break link to FS */ ep = list_first_entry (&dev->epfiles, struct ep_data, epfiles); list_del_init (&ep->epfiles); spin_unlock_irq (&dev->lock); dentry = ep->dentry; ep->dentry = NULL; parent = d_inode(dentry->d_parent); /* break link to controller */ mutex_lock(&ep->lock); if (ep->state == STATE_EP_ENABLED) (void) usb_ep_disable (ep->ep); ep->state = STATE_EP_UNBOUND; usb_ep_free_request (ep->ep, ep->req); ep->ep = NULL; mutex_unlock(&ep->lock); wake_up (&ep->wait); put_ep (ep); /* break link to dcache */ inode_lock(parent); d_delete (dentry); dput (dentry); inode_unlock(parent); spin_lock_irq (&dev->lock); } spin_unlock_irq (&dev->lock); } static struct dentry * gadgetfs_create_file (struct super_block *sb, char const *name, void *data, const struct file_operations *fops); static int activate_ep_files (struct dev_data *dev) { struct usb_ep *ep; struct ep_data *data; gadget_for_each_ep (ep, dev->gadget) { data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto enomem0; data->state = STATE_EP_DISABLED; mutex_init(&data->lock); init_waitqueue_head (&data->wait); strncpy (data->name, ep->name, sizeof (data->name) - 1); refcount_set (&data->count, 1); data->dev = dev; get_dev (dev); data->ep = ep; ep->driver_data = data; data->req = usb_ep_alloc_request (ep, GFP_KERNEL); if (!data->req) goto enomem1; data->dentry = gadgetfs_create_file (dev->sb, data->name, data, &ep_io_operations); if (!data->dentry) goto enomem2; list_add_tail (&data->epfiles, &dev->epfiles); } return 0; enomem2: usb_ep_free_request (ep, data->req); enomem1: put_dev (dev); kfree (data); enomem0: DBG (dev, "%s enomem\n", __func__); destroy_ep_files (dev); return -ENOMEM; } static void gadgetfs_unbind (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); DBG (dev, "%s\n", __func__); spin_lock_irq (&dev->lock); dev->state = STATE_DEV_UNBOUND; while (dev->udc_usage > 0) { spin_unlock_irq(&dev->lock); usleep_range(1000, 2000); spin_lock_irq(&dev->lock); } spin_unlock_irq (&dev->lock); destroy_ep_files (dev); gadget->ep0->driver_data = NULL; set_gadget_data (gadget, NULL); /* we've already been disconnected ... no i/o is active */ if (dev->req) usb_ep_free_request (gadget->ep0, dev->req); DBG (dev, "%s done\n", __func__); put_dev (dev); } static struct dev_data *the_device; static int gadgetfs_bind(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { struct dev_data *dev = the_device; if (!dev) return -ESRCH; if (0 != strcmp (CHIP, gadget->name)) { pr_err("%s expected %s controller not %s\n", shortname, CHIP, gadget->name); return -ENODEV; } set_gadget_data (gadget, dev); dev->gadget = gadget; gadget->ep0->driver_data = dev; /* preallocate control response and buffer */ dev->req = usb_ep_alloc_request (gadget->ep0, GFP_KERNEL); if (!dev->req) goto enomem; dev->req->context = NULL; dev->req->complete = epio_complete; if (activate_ep_files (dev) < 0) goto enomem; INFO (dev, "bound to %s driver\n", gadget->name); spin_lock_irq(&dev->lock); dev->state = STATE_DEV_UNCONNECTED; spin_unlock_irq(&dev->lock); get_dev (dev); return 0; enomem: gadgetfs_unbind (gadget); return -ENOMEM; } static void gadgetfs_disconnect (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); unsigned long flags; spin_lock_irqsave (&dev->lock, flags); if (dev->state == STATE_DEV_UNCONNECTED) goto exit; dev->state = STATE_DEV_UNCONNECTED; INFO (dev, "disconnected\n"); next_event (dev, GADGETFS_DISCONNECT); ep0_readable (dev); exit: spin_unlock_irqrestore (&dev->lock, flags); } static void gadgetfs_suspend (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); unsigned long flags; INFO (dev, "suspended from state %d\n", dev->state); spin_lock_irqsave(&dev->lock, flags); switch (dev->state) { case STATE_DEV_SETUP: // VERY odd... host died?? case STATE_DEV_CONNECTED: case STATE_DEV_UNCONNECTED: next_event (dev, GADGETFS_SUSPEND); ep0_readable (dev); /* FALLTHROUGH */ default: break; } spin_unlock_irqrestore(&dev->lock, flags); } static struct usb_gadget_driver gadgetfs_driver = { .function = (char *) driver_desc, .bind = gadgetfs_bind, .unbind = gadgetfs_unbind, .setup = gadgetfs_setup, .reset = gadgetfs_disconnect, .disconnect = gadgetfs_disconnect, .suspend = gadgetfs_suspend, .driver = { .name = (char *) shortname, }, }; /*----------------------------------------------------------------------*/ /* DEVICE INITIALIZATION * * fd = open ("/dev/gadget/$CHIP", O_RDWR) * status = write (fd, descriptors, sizeof descriptors) * * That write establishes the device configuration, so the kernel can * bind to the controller ... guaranteeing it can handle enumeration * at all necessary speeds. Descriptor order is: * * . message tag (u32, host order) ... for now, must be zero; it * would change to support features like multi-config devices * . full/low speed config ... all wTotalLength bytes (with interface, * class, altsetting, endpoint, and other descriptors) * . high speed config ... all descriptors, for high speed operation; * this one's optional except for high-speed hardware * . device descriptor * * Endpoints are not yet enabled. Drivers must wait until device * configuration and interface altsetting changes create * the need to configure (or unconfigure) them. * * After initialization, the device stays active for as long as that * $CHIP file is open. Events must then be read from that descriptor, * such as configuration notifications. */ static int is_valid_config(struct usb_config_descriptor *config, unsigned int total) { return config->bDescriptorType == USB_DT_CONFIG && config->bLength == USB_DT_CONFIG_SIZE && total >= USB_DT_CONFIG_SIZE && config->bConfigurationValue != 0 && (config->bmAttributes & USB_CONFIG_ATT_ONE) != 0 && (config->bmAttributes & USB_CONFIG_ATT_WAKEUP) == 0; /* FIXME if gadget->is_otg, _must_ include an otg descriptor */ /* FIXME check lengths: walk to end */ } static ssize_t dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) { struct dev_data *dev = fd->private_data; ssize_t value, length = len; unsigned total; u32 tag; char *kbuf; spin_lock_irq(&dev->lock); if (dev->state > STATE_DEV_OPENED) { value = ep0_write(fd, buf, len, ptr); spin_unlock_irq(&dev->lock); return value; } spin_unlock_irq(&dev->lock); if ((len < (USB_DT_CONFIG_SIZE + USB_DT_DEVICE_SIZE + 4)) || (len > PAGE_SIZE * 4)) return -EINVAL; /* we might need to change message format someday */ if (copy_from_user (&tag, buf, 4)) return -EFAULT; if (tag != 0) return -EINVAL; buf += 4; length -= 4; kbuf = memdup_user(buf, length); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); spin_lock_irq (&dev->lock); value = -EINVAL; if (dev->buf) { spin_unlock_irq(&dev->lock); kfree(kbuf); return value; } dev->buf = kbuf; /* full or low speed config */ dev->config = (void *) kbuf; total = le16_to_cpu(dev->config->wTotalLength); if (!is_valid_config(dev->config, total) || total > length - USB_DT_DEVICE_SIZE) goto fail; kbuf += total; length -= total; /* optional high speed config */ if (kbuf [1] == USB_DT_CONFIG) { dev->hs_config = (void *) kbuf; total = le16_to_cpu(dev->hs_config->wTotalLength); if (!is_valid_config(dev->hs_config, total) || total > length - USB_DT_DEVICE_SIZE) goto fail; kbuf += total; length -= total; } else { dev->hs_config = NULL; } /* could support multiple configs, using another encoding! */ /* device descriptor (tweaked for paranoia) */ if (length != USB_DT_DEVICE_SIZE) goto fail; dev->dev = (void *)kbuf; if (dev->dev->bLength != USB_DT_DEVICE_SIZE || dev->dev->bDescriptorType != USB_DT_DEVICE || dev->dev->bNumConfigurations != 1) goto fail; dev->dev->bcdUSB = cpu_to_le16 (0x0200); /* triggers gadgetfs_bind(); then we can enumerate. */ spin_unlock_irq (&dev->lock); if (dev->hs_config) gadgetfs_driver.max_speed = USB_SPEED_HIGH; else gadgetfs_driver.max_speed = USB_SPEED_FULL; value = usb_gadget_probe_driver(&gadgetfs_driver); if (value != 0) { spin_lock_irq(&dev->lock); goto fail; } else { /* at this point "good" hardware has for the first time * let the USB the host see us. alternatively, if users * unplug/replug that will clear all the error state. * * note: everything running before here was guaranteed * to choke driver model style diagnostics. from here * on, they can work ... except in cleanup paths that * kick in after the ep0 descriptor is closed. */ value = len; dev->gadget_registered = true; } return value; fail: dev->config = NULL; dev->hs_config = NULL; dev->dev = NULL; spin_unlock_irq (&dev->lock); pr_debug ("%s: %s fail %zd, %p\n", shortname, __func__, value, dev); kfree (dev->buf); dev->buf = NULL; return value; } static int dev_open (struct inode *inode, struct file *fd) { struct dev_data *dev = inode->i_private; int value = -EBUSY; spin_lock_irq(&dev->lock); if (dev->state == STATE_DEV_DISABLED) { dev->ev_next = 0; dev->state = STATE_DEV_OPENED; fd->private_data = dev; get_dev (dev); value = 0; } spin_unlock_irq(&dev->lock); return value; } static const struct file_operations ep0_operations = { .llseek = no_llseek, .open = dev_open, .read = ep0_read, .write = dev_config, .fasync = ep0_fasync, .poll = ep0_poll, .unlocked_ioctl = dev_ioctl, .release = dev_release, }; /*----------------------------------------------------------------------*/ /* FILESYSTEM AND SUPERBLOCK OPERATIONS * * Mounting the filesystem creates a controller file, used first for * device configuration then later for event monitoring. */ /* FIXME PAM etc could set this security policy without mount options * if epfiles inherited ownership and permissons from ep0 ... */ static unsigned default_uid; static unsigned default_gid; static unsigned default_perm = S_IRUSR | S_IWUSR; module_param (default_uid, uint, 0644); module_param (default_gid, uint, 0644); module_param (default_perm, uint, 0644); static struct inode * gadgetfs_make_inode (struct super_block *sb, void *data, const struct file_operations *fops, int mode) { struct inode *inode = new_inode (sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, default_uid); inode->i_gid = make_kgid(&init_user_ns, default_gid); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_private = data; inode->i_fop = fops; } return inode; } /* creates in fs root directory, so non-renamable and non-linkable. * so inode and dentry are paired, until device reconfig. */ static struct dentry * gadgetfs_create_file (struct super_block *sb, char const *name, void *data, const struct file_operations *fops) { struct dentry *dentry; struct inode *inode; dentry = d_alloc_name(sb->s_root, name); if (!dentry) return NULL; inode = gadgetfs_make_inode (sb, data, fops, S_IFREG | (default_perm & S_IRWXUGO)); if (!inode) { dput(dentry); return NULL; } d_add (dentry, inode); return dentry; } static const struct super_operations gadget_fs_operations = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, }; static int gadgetfs_fill_super (struct super_block *sb, void *opts, int silent) { struct inode *inode; struct dev_data *dev; if (the_device) return -ESRCH; CHIP = usb_get_gadget_udc_name(); if (!CHIP) return -ENODEV; /* superblock */ sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = GADGETFS_MAGIC; sb->s_op = &gadget_fs_operations; sb->s_time_gran = 1; /* root inode */ inode = gadgetfs_make_inode (sb, NULL, &simple_dir_operations, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto Enomem; inode->i_op = &simple_dir_inode_operations; if (!(sb->s_root = d_make_root (inode))) goto Enomem; /* the ep0 file is named after the controller we expect; * user mode code can use it for sanity checks, like we do. */ dev = dev_new (); if (!dev) goto Enomem; dev->sb = sb; dev->dentry = gadgetfs_create_file(sb, CHIP, dev, &ep0_operations); if (!dev->dentry) { put_dev(dev); goto Enomem; } /* other endpoint files are available after hardware setup, * from binding to a controller. */ the_device = dev; return 0; Enomem: kfree(CHIP); CHIP = NULL; return -ENOMEM; } /* "mount -t gadgetfs path /dev/gadget" ends up here */ static struct dentry * gadgetfs_mount (struct file_system_type *t, int flags, const char *path, void *opts) { return mount_single (t, flags, opts, gadgetfs_fill_super); } static void gadgetfs_kill_sb (struct super_block *sb) { kill_litter_super (sb); if (the_device) { put_dev (the_device); the_device = NULL; } kfree(CHIP); CHIP = NULL; } /*----------------------------------------------------------------------*/ static struct file_system_type gadgetfs_type = { .owner = THIS_MODULE, .name = shortname, .mount = gadgetfs_mount, .kill_sb = gadgetfs_kill_sb, }; MODULE_ALIAS_FS("gadgetfs"); /*----------------------------------------------------------------------*/ static int __init init (void) { int status; status = register_filesystem (&gadgetfs_type); if (status == 0) pr_info ("%s: %s, version " DRIVER_VERSION "\n", shortname, driver_desc); return status; } module_init (init); static void __exit cleanup (void) { pr_debug ("unregister %s\n", shortname); unregister_filesystem (&gadgetfs_type); } module_exit (cleanup);
1 1 1 1 5 1 5 5 6 6 1 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 /* * TTL modification target for IP tables * (C) 2000,2005 by Harald Welte <laforge@netfilter.org> * * Hop Limit modification target for ip6tables * Maciej Soltysiak <solt@dns.toxicfilms.tv> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/checksum.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ipt_TTL.h> #include <linux/netfilter_ipv6/ip6t_HL.h> MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_AUTHOR("Maciej Soltysiak <solt@dns.toxicfilms.tv>"); MODULE_DESCRIPTION("Xtables: Hoplimit/TTL Limit field modification target"); MODULE_LICENSE("GPL"); static unsigned int ttl_tg(struct sk_buff *skb, const struct xt_action_param *par) { struct iphdr *iph; const struct ipt_TTL_info *info = par->targinfo; int new_ttl; if (!skb_make_writable(skb, skb->len)) return NF_DROP; iph = ip_hdr(skb); switch (info->mode) { case IPT_TTL_SET: new_ttl = info->ttl; break; case IPT_TTL_INC: new_ttl = iph->ttl + info->ttl; if (new_ttl > 255) new_ttl = 255; break; case IPT_TTL_DEC: new_ttl = iph->ttl - info->ttl; if (new_ttl < 0) new_ttl = 0; break; default: new_ttl = iph->ttl; break; } if (new_ttl != iph->ttl) { csum_replace2(&iph->check, htons(iph->ttl << 8), htons(new_ttl << 8)); iph->ttl = new_ttl; } return XT_CONTINUE; } static unsigned int hl_tg6(struct sk_buff *skb, const struct xt_action_param *par) { struct ipv6hdr *ip6h; const struct ip6t_HL_info *info = par->targinfo; int new_hl; if (!skb_make_writable(skb, skb->len)) return NF_DROP; ip6h = ipv6_hdr(skb); switch (info->mode) { case IP6T_HL_SET: new_hl = info->hop_limit; break; case IP6T_HL_INC: new_hl = ip6h->hop_limit + info->hop_limit; if (new_hl > 255) new_hl = 255; break; case IP6T_HL_DEC: new_hl = ip6h->hop_limit - info->hop_limit; if (new_hl < 0) new_hl = 0; break; default: new_hl = ip6h->hop_limit; break; } ip6h->hop_limit = new_hl; return XT_CONTINUE; } static int ttl_tg_check(const struct xt_tgchk_param *par) { const struct ipt_TTL_info *info = par->targinfo; if (info->mode > IPT_TTL_MAXMODE) { pr_info("TTL: invalid or unknown mode %u\n", info->mode); return -EINVAL; } if (info->mode != IPT_TTL_SET && info->ttl == 0) return -EINVAL; return 0; } static int hl_tg6_check(const struct xt_tgchk_param *par) { const struct ip6t_HL_info *info = par->targinfo; if (info->mode > IP6T_HL_MAXMODE) { pr_info("invalid or unknown mode %u\n", info->mode); return -EINVAL; } if (info->mode != IP6T_HL_SET && info->hop_limit == 0) { pr_info("increment/decrement does not " "make sense with value 0\n"); return -EINVAL; } return 0; } static struct xt_target hl_tg_reg[] __read_mostly = { { .name = "TTL", .revision = 0, .family = NFPROTO_IPV4, .target = ttl_tg, .targetsize = sizeof(struct ipt_TTL_info), .table = "mangle", .checkentry = ttl_tg_check, .me = THIS_MODULE, }, { .name = "HL", .revision = 0, .family = NFPROTO_IPV6, .target = hl_tg6, .targetsize = sizeof(struct ip6t_HL_info), .table = "mangle", .checkentry = hl_tg6_check, .me = THIS_MODULE, }, }; static int __init hl_tg_init(void) { return xt_register_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); } static void __exit hl_tg_exit(void) { xt_unregister_targets(hl_tg_reg, ARRAY_SIZE(hl_tg_reg)); } module_init(hl_tg_init); module_exit(hl_tg_exit); MODULE_ALIAS("ipt_TTL"); MODULE_ALIAS("ip6t_HL");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 /* * Linux WiMAX * Collection of tools to manage debug operations. * * * Copyright (C) 2005-2007 Intel Corporation * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com> * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA * 02110-1301, USA. * * * Don't #include this file directly, read on! * * * EXECUTING DEBUGGING ACTIONS OR NOT * * The main thing this framework provides is decission power to take a * debug action (like printing a message) if the current debug level * allows it. * * The decission power is at two levels: at compile-time (what does * not make it is compiled out) and at run-time. The run-time * selection is done per-submodule (as they are declared by the user * of the framework). * * A call to d_test(L) (L being the target debug level) returns true * if the action should be taken because the current debug levels * allow it (both compile and run time). * * It follows that a call to d_test() that can be determined to be * always false at compile time will get the code depending on it * compiled out by optimization. * * * DEBUG LEVELS * * It is up to the caller to define how much a debugging level is. * * Convention sets 0 as "no debug" (so an action marked as debug level 0 * will always be taken). The increasing debug levels are used for * increased verbosity. * * * USAGE * * Group the code in modules and submodules inside each module [which * in most cases maps to Linux modules and .c files that compose * those]. * * * For each module, there is: * * - a MODULENAME (single word, legal C identifier) * * - a debug-levels.h header file that declares the list of * submodules and that is included by all .c files that use * the debugging tools. The file name can be anything. * * - some (optional) .c code to manipulate the runtime debug levels * through debugfs. * * The debug-levels.h file would look like: * * #ifndef __debug_levels__h__ * #define __debug_levels__h__ * * #define D_MODULENAME modulename * #define D_MASTER 10 * * #include <linux/wimax/debug.h> * * enum d_module { * D_SUBMODULE_DECLARE(submodule_1), * D_SUBMODULE_DECLARE(submodule_2), * ... * D_SUBMODULE_DECLARE(submodule_N) * }; * * #endif * * D_MASTER is the maximum compile-time debug level; any debug actions * above this will be out. D_MODULENAME is the module name (legal C * identifier), which has to be unique for each module (to avoid * namespace collisions during linkage). Note those #defines need to * be done before #including debug.h * * We declare N different submodules whose debug level can be * independently controlled during runtime. * * In a .c file of the module (and only in one of them), define the * following code: * * struct d_level D_LEVEL[] = { * D_SUBMODULE_DEFINE(submodule_1), * D_SUBMODULE_DEFINE(submodule_2), * ... * D_SUBMODULE_DEFINE(submodule_N), * }; * size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); * * Externs for d_level_MODULENAME and d_level_size_MODULENAME are used * and declared in this file using the D_LEVEL and D_LEVEL_SIZE macros * #defined also in this file. * * To manipulate from user space the levels, create a debugfs dentry * and then register each submodule with: * * result = d_level_register_debugfs("PREFIX_", submodule_X, parent); * if (result < 0) * goto error; * * Where PREFIX_ is a name of your chosing. This will create debugfs * file with a single numeric value that can be use to tweak it. To * remove the entires, just use debugfs_remove_recursive() on 'parent'. * * NOTE: remember that even if this will show attached to some * particular instance of a device, the settings are *global*. * * * On each submodule (for example, .c files), the debug infrastructure * should be included like this: * * #define D_SUBMODULE submodule_x // matches one in debug-levels.h * #include "debug-levels.h" * * after #including all your include files. * * * Now you can use the d_*() macros below [d_test(), d_fnstart(), * d_fnend(), d_printf(), d_dump()]. * * If their debug level is greater than D_MASTER, they will be * compiled out. * * If their debug level is lower or equal than D_MASTER but greater * than the current debug level of their submodule, they'll be * ignored. * * Otherwise, the action will be performed. */ #ifndef __debug__h__ #define __debug__h__ #include <linux/types.h> #include <linux/slab.h> struct device; /* Backend stuff */ /* * Debug backend: generate a message header from a 'struct device' * * @head: buffer where to place the header * @head_size: length of @head * @dev: pointer to device used to generate a header from. If NULL, * an empty ("") header is generated. */ static inline void __d_head(char *head, size_t head_size, struct device *dev) { if (dev == NULL) head[0] = 0; else if ((unsigned long)dev < 4096) { printk(KERN_ERR "E: Corrupt dev %p\n", dev); WARN_ON(1); } else snprintf(head, head_size, "%s %s: ", dev_driver_string(dev), dev_name(dev)); } /* * Debug backend: log some message if debugging is enabled * * @l: intended debug level * @tag: tag to prefix the message with * @dev: 'struct device' associated to this message * @f: printf-like format and arguments * * Note this is optimized out if it doesn't pass the compile-time * check; however, it is *always* compiled. This is useful to make * sure the printf-like formats and variables are always checked and * they don't get bit rot if you have all the debugging disabled. */ #define _d_printf(l, tag, dev, f, a...) \ do { \ char head[64]; \ if (!d_test(l)) \ break; \ __d_head(head, sizeof(head), dev); \ printk(KERN_ERR "%s%s%s: " f, head, __func__, tag, ##a); \ } while (0) /* * CPP sintatic sugar to generate A_B like symbol names when one of * the arguments is a a preprocessor #define. */ #define __D_PASTE__(varname, modulename) varname##_##modulename #define __D_PASTE(varname, modulename) (__D_PASTE__(varname, modulename)) #define _D_SUBMODULE_INDEX(_name) (D_SUBMODULE_DECLARE(_name)) /* * Store a submodule's runtime debug level and name */ struct d_level { u8 level; const char *name; }; /* * List of available submodules and their debug levels * * We call them d_level_MODULENAME and d_level_size_MODULENAME; the * macros D_LEVEL and D_LEVEL_SIZE contain the name already for * convenience. * * This array and the size are defined on some .c file that is part of * the current module. */ #define D_LEVEL __D_PASTE(d_level, D_MODULENAME) #define D_LEVEL_SIZE __D_PASTE(d_level_size, D_MODULENAME) extern struct d_level D_LEVEL[]; extern size_t D_LEVEL_SIZE; /* * Frontend stuff * * * Stuff you need to declare prior to using the actual "debug" actions * (defined below). */ #ifndef D_MODULENAME #error D_MODULENAME is not defined in your debug-levels.h file /** * D_MODULE - Name of the current module * * #define in your module's debug-levels.h, making sure it is * unique. This has to be a legal C identifier. */ #define D_MODULENAME undefined_modulename #endif #ifndef D_MASTER #warning D_MASTER not defined, but debug.h included! [see docs] /** * D_MASTER - Compile time maximum debug level * * #define in your debug-levels.h file to the maximum debug level the * runtime code will be allowed to have. This allows you to provide a * main knob. * * Anything above that level will be optimized out of the compile. * * Defaults to zero (no debug code compiled in). * * Maximum one definition per module (at the debug-levels.h file). */ #define D_MASTER 0 #endif #ifndef D_SUBMODULE #error D_SUBMODULE not defined, but debug.h included! [see docs] /** * D_SUBMODULE - Name of the current submodule * * #define in your submodule .c file before #including debug-levels.h * to the name of the current submodule as previously declared and * defined with D_SUBMODULE_DECLARE() (in your module's * debug-levels.h) and D_SUBMODULE_DEFINE(). * * This is used to provide runtime-control over the debug levels. * * Maximum one per .c file! Can be shared among different .c files * (meaning they belong to the same submodule categorization). */ #define D_SUBMODULE undefined_module #endif /** * D_SUBMODULE_DECLARE - Declare a submodule for runtime debug level control * * @_name: name of the submodule, restricted to the chars that make up a * valid C identifier ([a-zA-Z0-9_]). * * Declare in the module's debug-levels.h header file as: * * enum d_module { * D_SUBMODULE_DECLARE(submodule_1), * D_SUBMODULE_DECLARE(submodule_2), * D_SUBMODULE_DECLARE(submodule_3), * }; * * Some corresponding .c file needs to have a matching * D_SUBMODULE_DEFINE(). */ #define D_SUBMODULE_DECLARE(_name) __D_SUBMODULE_##_name /** * D_SUBMODULE_DEFINE - Define a submodule for runtime debug level control * * @_name: name of the submodule, restricted to the chars that make up a * valid C identifier ([a-zA-Z0-9_]). * * Use once per module (in some .c file) as: * * static * struct d_level d_level_SUBMODULENAME[] = { * D_SUBMODULE_DEFINE(submodule_1), * D_SUBMODULE_DEFINE(submodule_2), * D_SUBMODULE_DEFINE(submodule_3), * }; * size_t d_level_size_SUBDMODULENAME = ARRAY_SIZE(d_level_SUBDMODULENAME); * * Matching D_SUBMODULE_DECLARE()s have to be present in a * debug-levels.h header file. */ #define D_SUBMODULE_DEFINE(_name) \ [__D_SUBMODULE_##_name] = { \ .level = 0, \ .name = #_name \ } /* The actual "debug" operations */ /** * d_test - Returns true if debugging should be enabled * * @l: intended debug level (unsigned) * * If the master debug switch is enabled and the current settings are * higher or equal to the requested level, then debugging * output/actions should be enabled. * * NOTE: * * This needs to be coded so that it can be evaluated in compile * time; this is why the ugly BUG_ON() is placed in there, so the * D_MASTER evaluation compiles all out if it is compile-time false. */ #define d_test(l) \ ({ \ unsigned __l = l; /* type enforcer */ \ (D_MASTER) >= __l \ && ({ \ BUG_ON(_D_SUBMODULE_INDEX(D_SUBMODULE) >= D_LEVEL_SIZE);\ D_LEVEL[_D_SUBMODULE_INDEX(D_SUBMODULE)].level >= __l; \ }); \ }) /** * d_fnstart - log message at function start if debugging enabled * * @l: intended debug level * @_dev: 'struct device' pointer, NULL if none (for context) * @f: printf-like format and arguments */ #define d_fnstart(l, _dev, f, a...) _d_printf(l, " FNSTART", _dev, f, ## a) /** * d_fnend - log message at function end if debugging enabled * * @l: intended debug level * @_dev: 'struct device' pointer, NULL if none (for context) * @f: printf-like format and arguments */ #define d_fnend(l, _dev, f, a...) _d_printf(l, " FNEND", _dev, f, ## a) /** * d_printf - log message if debugging enabled * * @l: intended debug level * @_dev: 'struct device' pointer, NULL if none (for context) * @f: printf-like format and arguments */ #define d_printf(l, _dev, f, a...) _d_printf(l, "", _dev, f, ## a) /** * d_dump - log buffer hex dump if debugging enabled * * @l: intended debug level * @_dev: 'struct device' pointer, NULL if none (for context) * @f: printf-like format and arguments */ #define d_dump(l, dev, ptr, size) \ do { \ char head[64]; \ if (!d_test(l)) \ break; \ __d_head(head, sizeof(head), dev); \ print_hex_dump(KERN_ERR, head, 0, 16, 1, \ ((void *) ptr), (size), 0); \ } while (0) /** * Export a submodule's debug level over debugfs as PREFIXSUBMODULE * * @prefix: string to prefix the name with * @submodule: name of submodule (not a string, just the name) * @dentry: debugfs parent dentry * * Returns: 0 if ok, < 0 errno on error. * * For removing, just use debugfs_remove_recursive() on the parent. */ #define d_level_register_debugfs(prefix, name, parent) \ ({ \ int rc; \ struct dentry *fd; \ struct dentry *verify_parent_type = parent; \ fd = debugfs_create_u8( \ prefix #name, 0600, verify_parent_type, \ &(D_LEVEL[__D_SUBMODULE_ ## name].level)); \ rc = PTR_ERR(fd); \ if (IS_ERR(fd) && rc != -ENODEV) \ printk(KERN_ERR "%s: Can't create debugfs entry %s: " \ "%d\n", __func__, prefix #name, rc); \ else \ rc = 0; \ rc; \ }) static inline void d_submodule_set(struct d_level *d_level, size_t d_level_size, const char *submodule, u8 level, const char *tag) { struct d_level *itr, *top; int index = -1; for (itr = d_level, top = itr + d_level_size; itr < top; itr++) { index++; if (itr->name == NULL) { printk(KERN_ERR "%s: itr->name NULL?? (%p, #%d)\n", tag, itr, index); continue; } if (!strcmp(itr->name, submodule)) { itr->level = level; return; } } printk(KERN_ERR "%s: unknown submodule %s\n", tag, submodule); } /** * d_parse_params - Parse a string with debug parameters from the * command line * * @d_level: level structure (D_LEVEL) * @d_level_size: number of items in the level structure * (D_LEVEL_SIZE). * @_params: string with the parameters; this is a space (not tab!) * separated list of NAME:VALUE, where value is the debug level * and NAME is the name of the submodule. * @tag: string for error messages (example: MODULE.ARGNAME). */ static inline void d_parse_params(struct d_level *d_level, size_t d_level_size, const char *_params, const char *tag) { char submodule[130], *params, *params_orig, *token, *colon; unsigned level, tokens; if (_params == NULL) return; params_orig = kstrdup(_params, GFP_KERNEL); params = params_orig; while (1) { token = strsep(&params, " "); if (token == NULL) break; if (*token == '\0') /* eat joint spaces */ continue; /* kernel's sscanf %s eats until whitespace, so we * replace : by \n so it doesn't get eaten later by * strsep */ colon = strchr(token, ':'); if (colon != NULL) *colon = '\n'; tokens = sscanf(token, "%s\n%u", submodule, &level); if (colon != NULL) *colon = ':'; /* set back, for error messages */ if (tokens == 2) d_submodule_set(d_level, d_level_size, submodule, level, tag); else printk(KERN_ERR "%s: can't parse '%s' as a " "SUBMODULE:LEVEL (%d tokens)\n", tag, token, tokens); } kfree(params_orig); } #endif /* #ifndef __debug__h__ */
2 2 2 2 2 1 1 2 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hpfs/map.c * * Mikulas Patocka (mikulas@artax.karlin.mff.cuni.cz), 1998-1999 * * mapping structures to memory with some minimal checks */ #include "hpfs_fn.h" __le32 *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh) { return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0); } __le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block, struct quad_buffer_head *qbh, char *id) { secno sec; __le32 *ret; unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; if (hpfs_sb(s)->sb_chk) if (bmp_block >= n_bands) { hpfs_error(s, "hpfs_map_bitmap called with bad parameter: %08x at %s", bmp_block, id); return NULL; } sec = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]); if (!sec || sec > hpfs_sb(s)->sb_fs_size-4) { hpfs_error(s, "invalid bitmap block pointer %08x -> %08x at %s", bmp_block, sec, id); return NULL; } ret = hpfs_map_4sectors(s, sec, qbh, 4); if (ret) hpfs_prefetch_bitmap(s, bmp_block + 1); return ret; } void hpfs_prefetch_bitmap(struct super_block *s, unsigned bmp_block) { unsigned to_prefetch, next_prefetch; unsigned n_bands = (hpfs_sb(s)->sb_fs_size + 0x3fff) >> 14; if (unlikely(bmp_block >= n_bands)) return; to_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block]); if (unlikely(bmp_block + 1 >= n_bands)) next_prefetch = 0; else next_prefetch = le32_to_cpu(hpfs_sb(s)->sb_bmp_dir[bmp_block + 1]); hpfs_prefetch_sectors(s, to_prefetch, 4 + 4 * (to_prefetch + 4 == next_prefetch)); } /* * Load first code page into kernel memory, return pointer to 256-byte array, * first 128 bytes are uppercasing table for chars 128-255, next 128 bytes are * lowercasing table */ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps) { struct buffer_head *bh; secno cpds; unsigned cpi; unsigned char *ptr; unsigned char *cp_table; int i; struct code_page_data *cpd; struct code_page_directory *cp = hpfs_map_sector(s, cps, &bh, 0); if (!cp) return NULL; if (le32_to_cpu(cp->magic) != CP_DIR_MAGIC) { pr_err("Code page directory magic doesn't match (magic = %08x)\n", le32_to_cpu(cp->magic)); brelse(bh); return NULL; } if (!le32_to_cpu(cp->n_code_pages)) { pr_err("n_code_pages == 0\n"); brelse(bh); return NULL; } cpds = le32_to_cpu(cp->array[0].code_page_data); cpi = le16_to_cpu(cp->array[0].index); brelse(bh); if (cpi >= 3) { pr_err("Code page index out of array\n"); return NULL; } if (!(cpd = hpfs_map_sector(s, cpds, &bh, 0))) return NULL; if (le16_to_cpu(cpd->offs[cpi]) > 0x178) { pr_err("Code page index out of sector\n"); brelse(bh); return NULL; } ptr = (unsigned char *)cpd + le16_to_cpu(cpd->offs[cpi]) + 6; if (!(cp_table = kmalloc(256, GFP_KERNEL))) { pr_err("out of memory for code page table\n"); brelse(bh); return NULL; } memcpy(cp_table, ptr, 128); brelse(bh); /* Try to build lowercasing table from uppercasing one */ for (i=128; i<256; i++) cp_table[i]=i; for (i=128; i<256; i++) if (cp_table[i-128]!=i && cp_table[i-128]>=128) cp_table[cp_table[i-128]] = i; return cp_table; } __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) { struct buffer_head *bh; int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21; int i; __le32 *b; if (!(b = kmalloc(n * 512, GFP_KERNEL))) { pr_err("can't allocate memory for bitmap directory\n"); return NULL; } for (i=0;i<n;i++) { __le32 *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1); if (!d) { kfree(b); return NULL; } memcpy((char *)b + 512 * i, d, 512); brelse(bh); } return b; } void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock) { struct quad_buffer_head qbh; __le32 *directory; u32 n_hotfixes, n_used_hotfixes; unsigned i; n_hotfixes = le32_to_cpu(spareblock->n_spares); n_used_hotfixes = le32_to_cpu(spareblock->n_spares_used); if (n_hotfixes > 256 || n_used_hotfixes > n_hotfixes) { hpfs_error(s, "invalid number of hotfixes: %u, used: %u", n_hotfixes, n_used_hotfixes); return; } if (!(directory = hpfs_map_4sectors(s, le32_to_cpu(spareblock->hotfix_map), &qbh, 0))) { hpfs_error(s, "can't load hotfix map"); return; } for (i = 0; i < n_used_hotfixes; i++) { hpfs_sb(s)->hotfix_from[i] = le32_to_cpu(directory[i]); hpfs_sb(s)->hotfix_to[i] = le32_to_cpu(directory[n_hotfixes + i]); } hpfs_sb(s)->n_hotfixes = n_used_hotfixes; hpfs_brelse4(&qbh); } /* * Load fnode to memory */ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_head **bhp) { struct fnode *fnode; if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ino, 1, "fnode")) { return NULL; } if ((fnode = hpfs_map_sector(s, ino, bhp, FNODE_RD_AHEAD))) { if (hpfs_sb(s)->sb_chk) { struct extended_attribute *ea; struct extended_attribute *ea_end; if (le32_to_cpu(fnode->magic) != FNODE_MAGIC) { hpfs_error(s, "bad magic on fnode %08lx", (unsigned long)ino); goto bail; } if (!fnode_is_dir(fnode)) { if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes != (bp_internal(&fnode->btree) ? 12 : 8)) { hpfs_error(s, "bad number of nodes in fnode %08lx", (unsigned long)ino); goto bail; } if (le16_to_cpu(fnode->btree.first_free) != 8 + fnode->btree.n_used_nodes * (bp_internal(&fnode->btree) ? 8 : 12)) { hpfs_error(s, "bad first_free pointer in fnode %08lx", (unsigned long)ino); goto bail; } } if (le16_to_cpu(fnode->ea_size_s) && (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200)) { hpfs_error(s, "bad EA info in fnode %08lx: ea_offs == %04x ea_size_s == %04x", (unsigned long)ino, le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s)); goto bail; } ea = fnode_ea(fnode); ea_end = fnode_end_ea(fnode); while (ea != ea_end) { if (ea > ea_end) { hpfs_error(s, "bad EA in fnode %08lx", (unsigned long)ino); goto bail; } ea = next_ea(ea); } } } return fnode; bail: brelse(*bhp); return NULL; } struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buffer_head **bhp) { struct anode *anode; if (hpfs_sb(s)->sb_chk) if (hpfs_chk_sectors(s, ano, 1, "anode")) return NULL; if ((anode = hpfs_map_sector(s, ano, bhp, ANODE_RD_AHEAD))) if (hpfs_sb(s)->sb_chk) { if (le32_to_cpu(anode->magic) != ANODE_MAGIC) { hpfs_error(s, "bad magic on anode %08x", ano); goto bail; } if (le32_to_cpu(anode->self) != ano) { hpfs_error(s, "self pointer invalid on anode %08x", ano); goto bail; } if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes != (bp_internal(&anode->btree) ? 60 : 40)) { hpfs_error(s, "bad number of nodes in anode %08x", ano); goto bail; } if (le16_to_cpu(anode->btree.first_free) != 8 + anode->btree.n_used_nodes * (bp_internal(&anode->btree) ? 8 : 12)) { hpfs_error(s, "bad first_free pointer in anode %08x", ano); goto bail; } } return anode; bail: brelse(*bhp); return NULL; } /* * Load dnode to memory and do some checks */ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, struct quad_buffer_head *qbh) { struct dnode *dnode; if (hpfs_sb(s)->sb_chk) { if (hpfs_chk_sectors(s, secno, 4, "dnode")) return NULL; if (secno & 3) { hpfs_error(s, "dnode %08x not byte-aligned", secno); return NULL; } } if ((dnode = hpfs_map_4sectors(s, secno, qbh, DNODE_RD_AHEAD))) if (hpfs_sb(s)->sb_chk) { unsigned p, pp = 0; unsigned char *d = (unsigned char *)dnode; int b = 0; if (le32_to_cpu(dnode->magic) != DNODE_MAGIC) { hpfs_error(s, "bad magic on dnode %08x", secno); goto bail; } if (le32_to_cpu(dnode->self) != secno) hpfs_error(s, "bad self pointer on dnode %08x self = %08x", secno, le32_to_cpu(dnode->self)); /* Check dirents - bad dirents would cause infinite loops or shooting to memory */ if (le32_to_cpu(dnode->first_free) > 2048) { hpfs_error(s, "dnode %08x has first_free == %08x", secno, le32_to_cpu(dnode->first_free)); goto bail; } for (p = 20; p < le32_to_cpu(dnode->first_free); p += d[p] + (d[p+1] << 8)) { struct hpfs_dirent *de = (struct hpfs_dirent *)((char *)dnode + p); if (le16_to_cpu(de->length) > 292 || (le16_to_cpu(de->length) < 32) || (le16_to_cpu(de->length) & 3) || p + le16_to_cpu(de->length) > 2048) { hpfs_error(s, "bad dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) { if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok; hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } ok: if (hpfs_sb(s)->sb_chk >= 2) b |= 1 << de->down; if (de->down) if (de_down_pointer(de) < 0x10) { hpfs_error(s, "bad down pointer in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } pp = p; } if (p != le32_to_cpu(dnode->first_free)) { hpfs_error(s, "size on last dirent does not match first_free; dnode %08x", secno); goto bail; } if (d[pp + 30] != 1 || d[pp + 31] != 255) { hpfs_error(s, "dnode %08x does not end with \\377 entry", secno); goto bail; } if (b == 3) pr_err("unbalanced dnode tree, dnode %08x; see hpfs.txt 4 more info\n", secno); } return dnode; bail: hpfs_brelse4(qbh); return NULL; } dnode_secno hpfs_fnode_dno(struct super_block *s, ino_t ino) { struct buffer_head *bh; struct fnode *fnode; dnode_secno dno; fnode = hpfs_map_fnode(s, ino, &bh); if (!fnode) return 0; dno = le32_to_cpu(fnode->u.external[0].disk_secno); brelse(bh); return dno; }
2173 475 1890 422 159 224 20 21 225 225 279 271 268 8 977 603 8 7 24 20 20 1 20 1 19 2 18 22 22 18 13 1 18 8 18 14 15 129 47 128 43 43 41 39 67 106 105 103 105 3 103 9 103 68 60 2 7 57 53 57 78 88 82 10 3 10 299 281 299 183 1 299 281 279 3 299 70 93 66 93 87 93 90 93 9 6 7 7 1942 1945 1939 4 1940 1943 1943 1942 70 70 70 70 3 70 138 138 138 138 138 138 138 138 138 137 136 1 2 138 139 52 85 139 92 12 4 14 9 21 29 20 20 20 29 9 9 4 3 15 15 15 11 4 28 1 8 8 7 6 6 4 5 5 4 2 1 2 1 1 4 1 459 2 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/pipe.c * * Copyright (C) 1991, 1992, 1999 Linus Torvalds */ #include <linux/mm.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/log2.h> #include <linux/mount.h> #include <linux/magic.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> #include <linux/memcontrol.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include "internal.h" /* * New pipe buffers will be restricted to this size while the user is exceeding * their pipe buffer quota. The general pipe use case needs at least two * buffers: one for data yet to be read, and one for new data. If this is less * than two, then a write to a non-empty pipe may block even if the pipe is not * full. This can occur with GNU make jobserver or similar uses of pipes as * semaphores: multiple processes may be waiting to write tokens back to the * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/. * * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their * own risk, namely: pipe writes to non-full pipes may block until the pipe is * emptied. */ #define PIPE_MIN_DEF_BUFFERS 2 /* * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size */ unsigned int pipe_max_size = 1048576; /* * Minimum pipe size, as required by POSIX */ unsigned int pipe_min_size = PAGE_SIZE; /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ unsigned long pipe_user_pages_hard; unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* * We use a start+len construction, which provides full use of the * allocated memory. * -- Florian Coosmann (FGC) * * Reads with count = 0 should always return 0. * -- Julian Bradfield 1999-06-07. * * FIFOs and Pipes now generate SIGIO for both readers and writers. * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 * * pipe_read & write cleanup * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass) { if (pipe->files) mutex_lock_nested(&pipe->mutex, subclass); } void pipe_lock(struct pipe_inode_info *pipe) { /* * pipe_lock() nests non-pipe inode locks (for writing to a file) */ pipe_lock_nested(pipe, I_MUTEX_PARENT); } EXPORT_SYMBOL(pipe_lock); void pipe_unlock(struct pipe_inode_info *pipe) { if (pipe->files) mutex_unlock(&pipe->mutex); } EXPORT_SYMBOL(pipe_unlock); static inline void __pipe_lock(struct pipe_inode_info *pipe) { mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT); } static inline void __pipe_unlock(struct pipe_inode_info *pipe) { mutex_unlock(&pipe->mutex); } void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); if (pipe1 < pipe2) { pipe_lock_nested(pipe1, I_MUTEX_PARENT); pipe_lock_nested(pipe2, I_MUTEX_CHILD); } else { pipe_lock_nested(pipe2, I_MUTEX_PARENT); pipe_lock_nested(pipe1, I_MUTEX_CHILD); } } /* Drop the inode semaphore and wait for a pipe event, atomically */ void pipe_wait(struct pipe_inode_info *pipe) { DEFINE_WAIT(wait); /* * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE); pipe_unlock(pipe); schedule(); finish_wait(&pipe->wait, &wait); pipe_lock(pipe); } static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep * allocation cache. (Otherwise just release our reference to it) */ if (page_count(page) == 1 && !pipe->tmp_page) pipe->tmp_page = page; else put_page(page); } static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; if (page_count(page) == 1) { if (memcg_kmem_enabled()) memcg_kmem_uncharge(page, 0); __SetPageLocked(page); return 0; } return 1; } /** * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal * * Description: * This function attempts to steal the &struct page attached to * @buf. If successful, this function returns 0 and returns with * the page locked. The caller may then reuse the page for whatever * he wishes; the typical use is insertion into a different file * page cache. */ int generic_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; /* * A reference of one is golden, that means that the owner of this * page is the only one holding a reference to it. lock the page * and return OK. */ if (page_count(page) == 1) { lock_page(page); return 0; } return 1; } EXPORT_SYMBOL(generic_pipe_buf_steal); /** * generic_pipe_buf_get - get a reference to a &struct pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to * * Description: * This function grabs an extra reference to @buf. It's used in * in the tee() system call, when we duplicate the buffers in one * pipe into another. */ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); /** * generic_pipe_buf_confirm - verify contents of the pipe buffer * @info: the pipe that the buffer belongs to * @buf: the buffer to confirm * * Description: * This function does nothing, because the generic pipe code uses * pages that are always good when inserted into the pipe. */ int generic_pipe_buf_confirm(struct pipe_inode_info *info, struct pipe_buffer *buf) { return 0; } EXPORT_SYMBOL(generic_pipe_buf_confirm); /** * generic_pipe_buf_release - put a reference to a &struct pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to put a reference to * * Description: * This function releases a reference to @buf. */ void generic_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { put_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; static const struct pipe_buf_operations anon_pipe_buf_nomerge_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; static const struct pipe_buf_operations packet_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, .get = generic_pipe_buf_get, }; void pipe_buf_mark_unmergeable(struct pipe_buffer *buf) { if (buf->ops == &anon_pipe_buf_ops) buf->ops = &anon_pipe_buf_nomerge_ops; } static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; int do_wakeup; ssize_t ret; /* Null read succeeds. */ if (unlikely(total_len == 0)) return 0; do_wakeup = 0; ret = 0; __pipe_lock(pipe); for (;;) { int bufs = pipe->nrbufs; if (bufs) { int curbuf = pipe->curbuf; struct pipe_buffer *buf = pipe->bufs + curbuf; size_t chars = buf->len; size_t written; int error; if (chars > total_len) chars = total_len; error = pipe_buf_confirm(pipe, buf); if (error) { if (!ret) ret = error; break; } written = copy_page_to_iter(buf->page, buf->offset, chars, to); if (unlikely(written < chars)) { if (!ret) ret = -EFAULT; break; } ret += chars; buf->offset += chars; buf->len -= chars; /* Was it a packet buffer? Clean up and exit */ if (buf->flags & PIPE_BUF_FLAG_PACKET) { total_len = chars; buf->len = 0; } if (!buf->len) { pipe_buf_release(pipe, buf); curbuf = (curbuf + 1) & (pipe->buffers - 1); pipe->curbuf = curbuf; pipe->nrbufs = --bufs; do_wakeup = 1; } total_len -= chars; if (!total_len) break; /* common path: read succeeded */ } if (bufs) /* More to do? */ continue; if (!pipe->writers) break; if (!pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. * But if a writer sleeps in kernel space, then * we can wait for that data without violating POSIX. */ if (ret) break; if (filp->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } } if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } pipe_wait(pipe); } __pipe_unlock(pipe); /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLOUT | POLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); return ret; } static inline int is_packetized(struct file *file) { return (file->f_flags & O_DIRECT) != 0; } static ssize_t pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; ssize_t ret = 0; int do_wakeup = 0; size_t total_len = iov_iter_count(from); ssize_t chars; /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; __pipe_lock(pipe); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (pipe->buffers - 1); struct pipe_buffer *buf = pipe->bufs + lastbuf; int offset = buf->offset + buf->len; if (buf->ops->can_merge && offset + chars <= PAGE_SIZE) { ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; ret = copy_page_from_iter(buf->page, offset, chars, from); if (unlikely(ret < chars)) { ret = -EFAULT; goto out; } do_wakeup = 1; buf->len += ret; if (!iov_iter_count(from)) goto out; } } for (;;) { int bufs; if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } bufs = pipe->nrbufs; if (bufs < pipe->buffers) { int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pipe->tmp_page; int copied; if (!page) { page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; } pipe->tmp_page = page; } /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? */ do_wakeup = 1; copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { if (!ret) ret = -EFAULT; break; } ret += copied; /* Insert it into the buffer array */ buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = copied; buf->flags = 0; if (is_packetized(filp)) { buf->ops = &packet_pipe_buf_ops; buf->flags = PIPE_BUF_FLAG_PACKET; } pipe->nrbufs = ++bufs; pipe->tmp_page = NULL; if (!iov_iter_count(from)) break; } if (bufs < pipe->buffers) continue; if (filp->f_flags & O_NONBLOCK) { if (!ret) ret = -EAGAIN; break; } if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } pipe->waiting_writers++; pipe_wait(pipe); pipe->waiting_writers--; } out: __pipe_unlock(pipe); if (do_wakeup) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { int err = file_update_time(filp); if (err) ret = err; sb_end_write(file_inode(filp)->i_sb); } return ret; } static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; int count, buf, nrbufs; switch (cmd) { case FIONREAD: __pipe_lock(pipe); count = 0; buf = pipe->curbuf; nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { count += pipe->bufs[buf].len; buf = (buf+1) & (pipe->buffers - 1); } __pipe_unlock(pipe); return put_user(count, (int __user *)arg); default: return -ENOIOCTLCMD; } } /* No kernel lock held - fine */ static unsigned int pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; struct pipe_inode_info *pipe = filp->private_data; int nrbufs; poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ nrbufs = pipe->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= POLLHUP; } if (filp->f_mode & FMODE_WRITE) { mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0; /* * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). */ if (!pipe->readers) mask |= POLLERR; } return mask; } static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) { int kill = 0; spin_lock(&inode->i_lock); if (!--pipe->files) { inode->i_pipe = NULL; kill = 1; } spin_unlock(&inode->i_lock); if (kill) free_pipe_info(pipe); } static int pipe_release(struct inode *inode, struct file *file) { struct pipe_inode_info *pipe = file->private_data; __pipe_lock(pipe); if (file->f_mode & FMODE_READ) pipe->readers--; if (file->f_mode & FMODE_WRITE) pipe->writers--; if (pipe->readers || pipe->writers) { wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM | POLLERR | POLLHUP); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } __pipe_unlock(pipe); put_pipe_info(inode, pipe); return 0; } static int pipe_fasync(int fd, struct file *filp, int on) { struct pipe_inode_info *pipe = filp->private_data; int retval = 0; __pipe_lock(pipe); if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); if (retval < 0 && (filp->f_mode & FMODE_READ)) /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } __pipe_unlock(pipe); return retval; } static unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new) { return atomic_long_add_return(new - old, &user->pipe_bufs); } static bool too_many_pipe_buffers_soft(unsigned long user_bufs) { return pipe_user_pages_soft && user_bufs > pipe_user_pages_soft; } static bool too_many_pipe_buffers_hard(unsigned long user_bufs) { return pipe_user_pages_hard && user_bufs > pipe_user_pages_hard; } static bool is_unprivileged_user(void) { return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); } struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); unsigned long user_bufs; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe == NULL) goto out_free_uid; if (pipe_bufs * PAGE_SIZE > pipe_max_size && !capable(CAP_SYS_RESOURCE)) pipe_bufs = pipe_max_size >> PAGE_SHIFT; user_bufs = account_pipe_buffers(user, 0, pipe_bufs); if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS); pipe_bufs = PIPE_MIN_DEF_BUFFERS; } if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; pipe->buffers = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); return pipe; } out_revert_acct: (void) account_pipe_buffers(user, pipe_bufs, 0); kfree(pipe); out_free_uid: free_uid(user); return NULL; } void free_pipe_info(struct pipe_inode_info *pipe) { int i; (void) account_pipe_buffers(pipe->user, pipe->buffers, 0); free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) pipe_buf_release(pipe, buf); } if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); kfree(pipe); } static struct vfsmount *pipe_mnt __read_mostly; /* * pipefs_dname() is called from d_path(). */ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", d_inode(dentry)->i_ino); } static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) goto fail_inode; inode->i_ino = get_next_ino(); pipe = alloc_pipe_info(); if (!pipe) goto fail_iput; inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; inode->i_fop = &pipefifo_fops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because "mark_inode_dirty()" will think * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); return inode; fail_iput: iput(inode); fail_inode: return NULL; } int create_pipe_files(struct file **res, int flags) { int err; struct inode *inode = get_pipe_inode(); struct file *f; struct path path; if (!inode) return -ENFILE; err = -ENOMEM; path.dentry = d_alloc_pseudo(pipe_mnt->mnt_sb, &empty_name); if (!path.dentry) goto err_inode; path.mnt = mntget(pipe_mnt); d_instantiate(path.dentry, inode); f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops); if (IS_ERR(f)) { err = PTR_ERR(f); goto err_dentry; } f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); f->private_data = inode->i_pipe; res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops); if (IS_ERR(res[0])) { err = PTR_ERR(res[0]); goto err_file; } path_get(&path); res[0]->private_data = inode->i_pipe; res[0]->f_flags = O_RDONLY | (flags & O_NONBLOCK); res[1] = f; return 0; err_file: put_filp(f); err_dentry: free_pipe_info(inode->i_pipe); path_put(&path); return err; err_inode: free_pipe_info(inode->i_pipe); iput(inode); return err; } static int __do_pipe_flags(int *fd, struct file **files, int flags) { int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) return -EINVAL; error = create_pipe_files(files, flags); if (error) return error; error = get_unused_fd_flags(flags); if (error < 0) goto err_read_pipe; fdr = error; error = get_unused_fd_flags(flags); if (error < 0) goto err_fdr; fdw = error; audit_fd_pair(fdr, fdw); fd[0] = fdr; fd[1] = fdw; return 0; err_fdr: put_unused_fd(fdr); err_read_pipe: fput(files[0]); fput(files[1]); return error; } int do_pipe_flags(int *fd, int flags) { struct file *files[2]; int error = __do_pipe_flags(fd, files, flags); if (!error) { fd_install(fd[0], files[0]); fd_install(fd[1], files[1]); } return error; } /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) { struct file *files[2]; int fd[2]; int error; error = __do_pipe_flags(fd, files, flags); if (!error) { if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { fput(files[0]); fput(files[1]); put_unused_fd(fd[0]); put_unused_fd(fd[1]); error = -EFAULT; } else { fd_install(fd[0], files[0]); fd_install(fd[1], files[1]); } } return error; } SYSCALL_DEFINE1(pipe, int __user *, fildes) { return sys_pipe2(fildes, 0); } static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { int cur = *cnt; while (cur == *cnt) { pipe_wait(pipe); if (signal_pending(current)) break; } return cur == *cnt ? -ERESTARTSYS : 0; } static void wake_up_partner(struct pipe_inode_info *pipe) { wake_up_interruptible(&pipe->wait); } static int fifo_open(struct inode *inode, struct file *filp) { struct pipe_inode_info *pipe; bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; filp->f_version = 0; spin_lock(&inode->i_lock); if (inode->i_pipe) { pipe = inode->i_pipe; pipe->files++; spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; pipe->files = 1; spin_lock(&inode->i_lock); if (unlikely(inode->i_pipe)) { inode->i_pipe->files++; spin_unlock(&inode->i_lock); free_pipe_info(pipe); pipe = inode->i_pipe; } else { inode->i_pipe = pipe; spin_unlock(&inode->i_lock); } } filp->private_data = pipe; /* OK, we have a pipe and it's pinned down */ __pipe_lock(pipe); /* We can only do regular read/write on fifos */ filp->f_mode &= (FMODE_READ | FMODE_WRITE); switch (filp->f_mode) { case FMODE_READ: /* * O_RDONLY * POSIX.1 says that O_NONBLOCK means return with the FIFO * opened, even when there is no process writing the FIFO. */ pipe->r_counter++; if (pipe->readers++ == 0) wake_up_partner(pipe); if (!is_pipe && !pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ filp->f_version = pipe->w_counter; } else { if (wait_for_partner(pipe, &pipe->w_counter)) goto err_rd; } } break; case FMODE_WRITE: /* * O_WRONLY * POSIX.1 says that O_NONBLOCK means return -1 with * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; pipe->w_counter++; if (!pipe->writers++) wake_up_partner(pipe); if (!is_pipe && !pipe->readers) { if (wait_for_partner(pipe, &pipe->r_counter)) goto err_wr; } break; case FMODE_READ | FMODE_WRITE: /* * O_RDWR * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. * This implementation will NEVER block on a O_RDWR open, since * the process can at least talk to itself. */ pipe->readers++; pipe->writers++; pipe->r_counter++; pipe->w_counter++; if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(pipe); break; default: ret = -EINVAL; goto err; } /* Ok! */ __pipe_unlock(pipe); return 0; err_rd: if (!--pipe->readers) wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: if (!--pipe->writers) wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err: __pipe_unlock(pipe); put_pipe_info(inode, pipe); return ret; } const struct file_operations pipefifo_fops = { .open = fifo_open, .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, }; /* * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ static inline unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; if (size < pipe_min_size) size = pipe_min_size; nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; if (nr_pages == 0) return 0; return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } /* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) { struct pipe_buffer *bufs; unsigned int size, nr_pages; unsigned long user_bufs; long ret = 0; size = round_pipe_size(arg); if (size == 0) return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) return -EINVAL; /* * If trying to increase the pipe capacity, check that an * unprivileged user is not trying to exceed various limits * (soft limit check here, hard limit check just below). * Decreasing the pipe capacity is always permitted, even * if the user is currently over a limit. */ if (nr_pages > pipe->buffers && size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) return -EPERM; user_bufs = account_pipe_buffers(pipe->user, pipe->buffers, nr_pages); if (nr_pages > pipe->buffers && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && is_unprivileged_user()) { ret = -EPERM; goto out_revert_acct; } /* * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't * expect a lot of shrink+grow operations, just free and allocate * again like we would do for growing. If the pipe currently * contains more buffers than arg, then return busy. */ if (nr_pages < pipe->nrbufs) { ret = -EBUSY; goto out_revert_acct; } bufs = kcalloc(nr_pages, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) { ret = -ENOMEM; goto out_revert_acct; } /* * The pipe array wraps around, so just start the new one at zero * and adjust the indexes. */ if (pipe->nrbufs) { unsigned int tail; unsigned int head; tail = pipe->curbuf + pipe->nrbufs; if (tail < pipe->buffers) tail = 0; else tail &= (pipe->buffers - 1); head = pipe->nrbufs - tail; if (head) memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer)); if (tail) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; pipe->buffers = nr_pages; return nr_pages * PAGE_SIZE; out_revert_acct: (void) account_pipe_buffers(pipe->user, nr_pages, pipe->buffers); return ret; } /* * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax * will return an error. */ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { unsigned int rounded_pipe_max_size; int ret; ret = proc_douintvec_minmax(table, write, buf, lenp, ppos); if (ret < 0 || !write) return ret; rounded_pipe_max_size = round_pipe_size(pipe_max_size); if (rounded_pipe_max_size == 0) return -EINVAL; pipe_max_size = rounded_pipe_max_size; return ret; } /* * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same * location, so checking ->i_pipe is not enough to verify that this is a * pipe. */ struct pipe_inode_info *get_pipe_info(struct file *file) { return file->f_op == &pipefifo_fops ? file->private_data : NULL; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe; long ret; pipe = get_pipe_info(file); if (!pipe) return -EBADF; __pipe_lock(pipe); switch (cmd) { case F_SETPIPE_SZ: ret = pipe_set_size(pipe, arg); break; case F_GETPIPE_SZ: ret = pipe->buffers * PAGE_SIZE; break; default: ret = -EINVAL; break; } __pipe_unlock(pipe); return ret; } static const struct super_operations pipefs_ops = { .destroy_inode = free_inode_nonrcu, .statfs = simple_statfs, }; /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. */ static struct dentry *pipefs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_pseudo(fs_type, "pipe:", &pipefs_ops, &pipefs_dentry_operations, PIPEFS_MAGIC); } static struct file_system_type pipe_fs_type = { .name = "pipefs", .mount = pipefs_mount, .kill_sb = kill_anon_super, }; static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { err = PTR_ERR(pipe_mnt); unregister_filesystem(&pipe_fs_type); } } return err; } fs_initcall(init_pipe_fs);
341 341 330 330 98 98 192 191 341 39 39 39 39 39 24 18 6 39 337 337 309 331 337 337 337 337 337 337 249 129 129 337 337 340 297 340 340 340 337 337 337 2 340 340 2 2 291 231 231 291 45 46 4 1 46 46 45 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/bfind.c * * Copyright (C) 2001 * Brad Boyer (flar@allandria.com) * (C) 2003 Ardis Technologies <roman@ardistech.com> * * Search routines for btrees */ #include <linux/slab.h> #include "hfsplus_fs.h" int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd) { void *ptr; fd->tree = tree; fd->bnode = NULL; ptr = kmalloc(tree->max_key_len * 2 + 4, GFP_KERNEL); if (!ptr) return -ENOMEM; fd->search_key = ptr; fd->key = ptr + tree->max_key_len + 2; hfs_dbg(BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); switch (tree->cnid) { case HFSPLUS_CAT_CNID: mutex_lock_nested(&tree->tree_lock, CATALOG_BTREE_MUTEX); break; case HFSPLUS_EXT_CNID: mutex_lock_nested(&tree->tree_lock, EXTENTS_BTREE_MUTEX); break; case HFSPLUS_ATTR_CNID: mutex_lock_nested(&tree->tree_lock, ATTR_BTREE_MUTEX); break; default: BUG(); } return 0; } void hfs_find_exit(struct hfs_find_data *fd) { hfs_bnode_put(fd->bnode); kfree(fd->search_key); hfs_dbg(BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); mutex_unlock(&fd->tree->tree_lock); fd->tree = NULL; } int hfs_find_1st_rec_by_cnid(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { __be32 cur_cnid; __be32 search_cnid; if (bnode->tree->cnid == HFSPLUS_EXT_CNID) { cur_cnid = fd->key->ext.cnid; search_cnid = fd->search_key->ext.cnid; } else if (bnode->tree->cnid == HFSPLUS_CAT_CNID) { cur_cnid = fd->key->cat.parent; search_cnid = fd->search_key->cat.parent; } else if (bnode->tree->cnid == HFSPLUS_ATTR_CNID) { cur_cnid = fd->key->attr.cnid; search_cnid = fd->search_key->attr.cnid; } else { cur_cnid = 0; /* used-uninitialized warning */ search_cnid = 0; BUG(); } if (cur_cnid == search_cnid) { (*end) = (*cur_rec); if ((*begin) == (*end)) return 1; } else { if (be32_to_cpu(cur_cnid) < be32_to_cpu(search_cnid)) (*begin) = (*cur_rec) + 1; else (*end) = (*cur_rec) - 1; } return 0; } int hfs_find_rec_by_key(struct hfs_bnode *bnode, struct hfs_find_data *fd, int *begin, int *end, int *cur_rec) { int cmpval; cmpval = bnode->tree->keycmp(fd->key, fd->search_key); if (!cmpval) { (*end) = (*cur_rec); return 1; } if (cmpval < 0) (*begin) = (*cur_rec) + 1; else *(end) = (*cur_rec) - 1; return 0; } /* Find the record in bnode that best matches key (not greater than...)*/ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, search_strategy_t rec_found) { u16 off, len, keylen; int rec; int b, e; int res; BUG_ON(!rec_found); b = 0; e = bnode->num_recs - 1; res = -ENOENT; do { rec = (e + b) / 2; len = hfs_brec_lenoff(bnode, rec, &off); keylen = hfs_brec_keylen(bnode, rec); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); if (rec_found(bnode, fd, &b, &e, &rec)) { res = 0; goto done; } } while (b <= e); if (rec != e && e >= 0) { len = hfs_brec_lenoff(bnode, e, &off); keylen = hfs_brec_keylen(bnode, e); if (keylen == 0) { res = -EINVAL; goto fail; } hfs_bnode_read(bnode, fd->key, off, keylen); } done: fd->record = e; fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; fail: return res; } /* Traverse a B*Tree from the root to a leaf finding best fit to key */ /* Return allocated copy of node found, set recnum to best record */ int hfs_brec_find(struct hfs_find_data *fd, search_strategy_t do_key_compare) { struct hfs_btree *tree; struct hfs_bnode *bnode; u32 nidx, parent; __be32 data; int height, res; tree = fd->tree; if (fd->bnode) hfs_bnode_put(fd->bnode); fd->bnode = NULL; nidx = tree->root; if (!nidx) return -ENOENT; height = tree->depth; res = 0; parent = 0; for (;;) { bnode = hfs_bnode_find(tree, nidx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; break; } if (bnode->height != height) goto invalid; if (bnode->type != (--height ? HFS_NODE_INDEX : HFS_NODE_LEAF)) goto invalid; bnode->parent = parent; res = __hfs_brec_find(bnode, fd, do_key_compare); if (!height) break; if (fd->record < 0) goto release; parent = nidx; hfs_bnode_read(bnode, &data, fd->entryoffset, 4); nidx = be32_to_cpu(data); hfs_bnode_put(bnode); } fd->bnode = bnode; return res; invalid: pr_err("inconsistency in B*Tree (%d,%d,%d,%u,%u)\n", height, bnode->height, bnode->type, nidx, parent); res = -EIO; release: hfs_bnode_put(bnode); return res; } int hfs_brec_read(struct hfs_find_data *fd, void *rec, int rec_len) { int res; res = hfs_brec_find(fd, hfs_find_rec_by_key); if (res) return res; if (fd->entrylength > rec_len) return -EINVAL; hfs_bnode_read(fd->bnode, rec, fd->entryoffset, fd->entrylength); return 0; } int hfs_brec_goto(struct hfs_find_data *fd, int cnt) { struct hfs_btree *tree; struct hfs_bnode *bnode; int idx, res = 0; u16 off, len, keylen; bnode = fd->bnode; tree = bnode->tree; if (cnt < 0) { cnt = -cnt; while (cnt > fd->record) { cnt -= fd->record + 1; fd->record = bnode->num_recs - 1; idx = bnode->prev; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record -= cnt; } else { while (cnt >= bnode->num_recs - fd->record) { cnt -= bnode->num_recs - fd->record; fd->record = 0; idx = bnode->next; if (!idx) { res = -ENOENT; goto out; } hfs_bnode_put(bnode); bnode = hfs_bnode_find(tree, idx); if (IS_ERR(bnode)) { res = PTR_ERR(bnode); bnode = NULL; goto out; } } fd->record += cnt; } len = hfs_brec_lenoff(bnode, fd->record, &off); keylen = hfs_brec_keylen(bnode, fd->record); if (keylen == 0) { res = -EINVAL; goto out; } fd->keyoffset = off; fd->keylength = keylen; fd->entryoffset = off + keylen; fd->entrylength = len - keylen; hfs_bnode_read(bnode, fd->key, off, keylen); out: fd->bnode = bnode; return res; }
23 24 24 23 24 24 11 9 8 9 4 4 3 4 1 1 9 8 4 22 7 22 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 /* * VMware VMCI Driver * * Copyright (C) 2012 VMware, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the * Free Software Foundation version 2 and no later version. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. */ #include <linux/slab.h> #include "vmci_handle_array.h" static size_t handle_arr_calc_size(u32 capacity) { return VMCI_HANDLE_ARRAY_HEADER_SIZE + capacity * sizeof(struct vmci_handle); } struct vmci_handle_arr *vmci_handle_arr_create(u32 capacity, u32 max_capacity) { struct vmci_handle_arr *array; if (max_capacity == 0 || capacity > max_capacity) return NULL; if (capacity == 0) capacity = min((u32)VMCI_HANDLE_ARRAY_DEFAULT_CAPACITY, max_capacity); array = kmalloc(handle_arr_calc_size(capacity), GFP_ATOMIC); if (!array) return NULL; array->capacity = capacity; array->max_capacity = max_capacity; array->size = 0; return array; } void vmci_handle_arr_destroy(struct vmci_handle_arr *array) { kfree(array); } int vmci_handle_arr_append_entry(struct vmci_handle_arr **array_ptr, struct vmci_handle handle) { struct vmci_handle_arr *array = *array_ptr; if (unlikely(array->size >= array->capacity)) { /* reallocate. */ struct vmci_handle_arr *new_array; u32 capacity_bump = min(array->max_capacity - array->capacity, array->capacity); size_t new_size = handle_arr_calc_size(array->capacity + capacity_bump); if (array->size >= array->max_capacity) return VMCI_ERROR_NO_MEM; new_array = krealloc(array, new_size, GFP_ATOMIC); if (!new_array) return VMCI_ERROR_NO_MEM; new_array->capacity += capacity_bump; *array_ptr = array = new_array; } array->entries[array->size] = handle; array->size++; return VMCI_SUCCESS; } /* * Handle that was removed, VMCI_INVALID_HANDLE if entry not found. */ struct vmci_handle vmci_handle_arr_remove_entry(struct vmci_handle_arr *array, struct vmci_handle entry_handle) { struct vmci_handle handle = VMCI_INVALID_HANDLE; u32 i; for (i = 0; i < array->size; i++) { if (vmci_handle_is_equal(array->entries[i], entry_handle)) { handle = array->entries[i]; array->size--; array->entries[i] = array->entries[array->size]; array->entries[array->size] = VMCI_INVALID_HANDLE; break; } } return handle; } /* * Handle that was removed, VMCI_INVALID_HANDLE if array was empty. */ struct vmci_handle vmci_handle_arr_remove_tail(struct vmci_handle_arr *array) { struct vmci_handle handle = VMCI_INVALID_HANDLE; if (array->size) { array->size--; handle = array->entries[array->size]; array->entries[array->size] = VMCI_INVALID_HANDLE; } return handle; } /* * Handle at given index, VMCI_INVALID_HANDLE if invalid index. */ struct vmci_handle vmci_handle_arr_get_entry(const struct vmci_handle_arr *array, u32 index) { if (unlikely(index >= array->size)) return VMCI_INVALID_HANDLE; return array->entries[index]; } bool vmci_handle_arr_has_entry(const struct vmci_handle_arr *array, struct vmci_handle entry_handle) { u32 i; for (i = 0; i < array->size; i++) if (vmci_handle_is_equal(array->entries[i], entry_handle)) return true; return false; } /* * NULL if the array is empty. Otherwise, a pointer to the array * of VMCI handles in the handle array. */ struct vmci_handle *vmci_handle_arr_get_handles(struct vmci_handle_arr *array) { if (array->size) return array->entries; return NULL; }
10 10 38 38 12 16 16 16 16 16 16 16 16 3 1 3 3 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 /* * Copyright 2011 Red Hat, Inc. * Copyright © 2014 The Chromium OS Authors * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software") * to deal in the software without restriction, including without limitation * on the rights to use, copy, modify, merge, publish, distribute, sub * license, and/or sell copies of the Software, and to permit persons to whom * them Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTIBILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER * IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Adam Jackson <ajax@redhat.com> * Ben Widawsky <ben@bwidawsk.net> */ /** * This is vgem, a (non-hardware-backed) GEM service. This is used by Mesa's * software renderer and the X server for efficient buffer sharing. */ #include <linux/module.h> #include <linux/ramfs.h> #include <linux/shmem_fs.h> #include <linux/dma-buf.h> #include "vgem_drv.h" #define DRIVER_NAME "vgem" #define DRIVER_DESC "Virtual GEM provider" #define DRIVER_DATE "20120112" #define DRIVER_MAJOR 1 #define DRIVER_MINOR 0 static struct vgem_device { struct drm_device drm; struct platform_device *platform; } *vgem_device; static void vgem_gem_free_object(struct drm_gem_object *obj) { struct drm_vgem_gem_object *vgem_obj = to_vgem_bo(obj); kvfree(vgem_obj->pages); mutex_destroy(&vgem_obj->pages_lock); if (obj->import_attach) drm_prime_gem_destroy(obj, vgem_obj->table); drm_gem_object_release(obj); kfree(vgem_obj); } static int vgem_gem_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct drm_vgem_gem_object *obj = vma->vm_private_data; /* We don't use vmf->pgoff since that has the fake offset */ unsigned long vaddr = vmf->address; int ret; loff_t num_pages; pgoff_t page_offset; page_offset = (vaddr - vma->vm_start) >> PAGE_SHIFT; num_pages = DIV_ROUND_UP(obj->base.size, PAGE_SIZE); if (page_offset > num_pages) return VM_FAULT_SIGBUS; ret = -ENOENT; mutex_lock(&obj->pages_lock); if (obj->pages) { get_page(obj->pages[page_offset]); vmf->page = obj->pages[page_offset]; ret = 0; } mutex_unlock(&obj->pages_lock); if (ret) { struct page *page; page = shmem_read_mapping_page( file_inode(obj->base.filp)->i_mapping, page_offset); if (!IS_ERR(page)) { vmf->page = page; ret = 0; } else switch (PTR_ERR(page)) { case -ENOSPC: case -ENOMEM: ret = VM_FAULT_OOM; break; case -EBUSY: ret = VM_FAULT_RETRY; break; case -EFAULT: case -EINVAL: ret = VM_FAULT_SIGBUS; break; default: WARN_ON(PTR_ERR(page)); ret = VM_FAULT_SIGBUS; break; } } return ret; } static const struct vm_operations_struct vgem_gem_vm_ops = { .fault = vgem_gem_fault, .open = drm_gem_vm_open, .close = drm_gem_vm_close, }; static int vgem_open(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile; int ret; vfile = kzalloc(sizeof(*vfile), GFP_KERNEL); if (!vfile) return -ENOMEM; file->driver_priv = vfile; ret = vgem_fence_open(vfile); if (ret) { kfree(vfile); return ret; } return 0; } static void vgem_postclose(struct drm_device *dev, struct drm_file *file) { struct vgem_file *vfile = file->driver_priv; vgem_fence_close(vfile); kfree(vfile); } static struct drm_vgem_gem_object *__vgem_gem_create(struct drm_device *dev, unsigned long size) { struct drm_vgem_gem_object *obj; int ret; obj = kzalloc(sizeof(*obj), GFP_KERNEL); if (!obj) return ERR_PTR(-ENOMEM); ret = drm_gem_object_init(dev, &obj->base, roundup(size, PAGE_SIZE)); if (ret) { kfree(obj); return ERR_PTR(ret); } mutex_init(&obj->pages_lock); return obj; } static void __vgem_gem_destroy(struct drm_vgem_gem_object *obj) { drm_gem_object_release(&obj->base); kfree(obj); } static struct drm_gem_object *vgem_gem_create(struct drm_device *dev, struct drm_file *file, unsigned int *handle, unsigned long size) { struct drm_vgem_gem_object *obj; int ret; obj = __vgem_gem_create(dev, size); if (IS_ERR(obj)) return ERR_CAST(obj); ret = drm_gem_handle_create(file, &obj->base, handle); if (ret) { drm_gem_object_put_unlocked(&obj->base); return ERR_PTR(ret); } return &obj->base; } static int vgem_gem_dumb_create(struct drm_file *file, struct drm_device *dev, struct drm_mode_create_dumb *args) { struct drm_gem_object *gem_object; u64 pitch, size; pitch = args->width * DIV_ROUND_UP(args->bpp, 8); size = args->height * pitch; if (size == 0) return -EINVAL; gem_object = vgem_gem_create(dev, file, &args->handle, size); if (IS_ERR(gem_object)) return PTR_ERR(gem_object); args->size = gem_object->size; args->pitch = pitch; drm_gem_object_put_unlocked(gem_object); DRM_DEBUG_DRIVER("Created object of size %llu\n", args->size); return 0; } static struct drm_ioctl_desc vgem_ioctls[] = { DRM_IOCTL_DEF_DRV(VGEM_FENCE_ATTACH, vgem_fence_attach_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(VGEM_FENCE_SIGNAL, vgem_fence_signal_ioctl, DRM_AUTH|DRM_RENDER_ALLOW), }; static int vgem_mmap(struct file *filp, struct vm_area_struct *vma) { unsigned long flags = vma->vm_flags; int ret; ret = drm_gem_mmap(filp, vma); if (ret) return ret; /* Keep the WC mmaping set by drm_gem_mmap() but our pages * are ordinary and not special. */ vma->vm_flags = flags | VM_DONTEXPAND | VM_DONTDUMP; return 0; } static const struct file_operations vgem_driver_fops = { .owner = THIS_MODULE, .open = drm_open, .mmap = vgem_mmap, .poll = drm_poll, .read = drm_read, .unlocked_ioctl = drm_ioctl, .compat_ioctl = drm_compat_ioctl, .release = drm_release, }; static struct page **vgem_pin_pages(struct drm_vgem_gem_object *bo) { mutex_lock(&bo->pages_lock); if (bo->pages_pin_count++ == 0) { struct page **pages; pages = drm_gem_get_pages(&bo->base); if (IS_ERR(pages)) { bo->pages_pin_count--; mutex_unlock(&bo->pages_lock); return pages; } bo->pages = pages; } mutex_unlock(&bo->pages_lock); return bo->pages; } static void vgem_unpin_pages(struct drm_vgem_gem_object *bo) { mutex_lock(&bo->pages_lock); if (--bo->pages_pin_count == 0) { drm_gem_put_pages(&bo->base, bo->pages, true, true); bo->pages = NULL; } mutex_unlock(&bo->pages_lock); } static int vgem_prime_pin(struct drm_gem_object *obj) { struct drm_vgem_gem_object *bo = to_vgem_bo(obj); long n_pages = obj->size >> PAGE_SHIFT; struct page **pages; pages = vgem_pin_pages(bo); if (IS_ERR(pages)) return PTR_ERR(pages); /* Flush the object from the CPU cache so that importers can rely * on coherent indirect access via the exported dma-address. */ drm_clflush_pages(pages, n_pages); return 0; } static void vgem_prime_unpin(struct drm_gem_object *obj) { struct drm_vgem_gem_object *bo = to_vgem_bo(obj); vgem_unpin_pages(bo); } static struct sg_table *vgem_prime_get_sg_table(struct drm_gem_object *obj) { struct drm_vgem_gem_object *bo = to_vgem_bo(obj); return drm_prime_pages_to_sg(bo->pages, bo->base.size >> PAGE_SHIFT); } static struct drm_gem_object* vgem_prime_import(struct drm_device *dev, struct dma_buf *dma_buf) { struct vgem_device *vgem = container_of(dev, typeof(*vgem), drm); return drm_gem_prime_import_dev(dev, dma_buf, &vgem->platform->dev); } static struct drm_gem_object *vgem_prime_import_sg_table(struct drm_device *dev, struct dma_buf_attachment *attach, struct sg_table *sg) { struct drm_vgem_gem_object *obj; int npages; obj = __vgem_gem_create(dev, attach->dmabuf->size); if (IS_ERR(obj)) return ERR_CAST(obj); npages = PAGE_ALIGN(attach->dmabuf->size) / PAGE_SIZE; obj->table = sg; obj->pages = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); if (!obj->pages) { __vgem_gem_destroy(obj); return ERR_PTR(-ENOMEM); } obj->pages_pin_count++; /* perma-pinned */ drm_prime_sg_to_page_addr_arrays(obj->table, obj->pages, NULL, npages); return &obj->base; } static void *vgem_prime_vmap(struct drm_gem_object *obj) { struct drm_vgem_gem_object *bo = to_vgem_bo(obj); long n_pages = obj->size >> PAGE_SHIFT; struct page **pages; pages = vgem_pin_pages(bo); if (IS_ERR(pages)) return NULL; return vmap(pages, n_pages, 0, pgprot_writecombine(PAGE_KERNEL)); } static void vgem_prime_vunmap(struct drm_gem_object *obj, void *vaddr) { struct drm_vgem_gem_object *bo = to_vgem_bo(obj); vunmap(vaddr); vgem_unpin_pages(bo); } static int vgem_prime_mmap(struct drm_gem_object *obj, struct vm_area_struct *vma) { int ret; if (obj->size < vma->vm_end - vma->vm_start) return -EINVAL; if (!obj->filp) return -ENODEV; ret = call_mmap(obj->filp, vma); if (ret) return ret; fput(vma->vm_file); vma->vm_file = get_file(obj->filp); vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); return 0; } static void vgem_release(struct drm_device *dev) { struct vgem_device *vgem = container_of(dev, typeof(*vgem), drm); platform_device_unregister(vgem->platform); drm_dev_fini(&vgem->drm); kfree(vgem); } static struct drm_driver vgem_driver = { .driver_features = DRIVER_GEM | DRIVER_PRIME, .release = vgem_release, .open = vgem_open, .postclose = vgem_postclose, .gem_free_object_unlocked = vgem_gem_free_object, .gem_vm_ops = &vgem_gem_vm_ops, .ioctls = vgem_ioctls, .num_ioctls = ARRAY_SIZE(vgem_ioctls), .fops = &vgem_driver_fops, .dumb_create = vgem_gem_dumb_create, .prime_handle_to_fd = drm_gem_prime_handle_to_fd, .prime_fd_to_handle = drm_gem_prime_fd_to_handle, .gem_prime_pin = vgem_prime_pin, .gem_prime_unpin = vgem_prime_unpin, .gem_prime_import = vgem_prime_import, .gem_prime_export = drm_gem_prime_export, .gem_prime_import_sg_table = vgem_prime_import_sg_table, .gem_prime_get_sg_table = vgem_prime_get_sg_table, .gem_prime_vmap = vgem_prime_vmap, .gem_prime_vunmap = vgem_prime_vunmap, .gem_prime_mmap = vgem_prime_mmap, .name = DRIVER_NAME, .desc = DRIVER_DESC, .date = DRIVER_DATE, .major = DRIVER_MAJOR, .minor = DRIVER_MINOR, }; static int __init vgem_init(void) { int ret; vgem_device = kzalloc(sizeof(*vgem_device), GFP_KERNEL); if (!vgem_device) return -ENOMEM; vgem_device->platform = platform_device_register_simple("vgem", -1, NULL, 0); if (IS_ERR(vgem_device->platform)) { ret = PTR_ERR(vgem_device->platform); goto out_free; } dma_coerce_mask_and_coherent(&vgem_device->platform->dev, DMA_BIT_MASK(64)); ret = drm_dev_init(&vgem_device->drm, &vgem_driver, &vgem_device->platform->dev); if (ret) goto out_unregister; /* Final step: expose the device/driver to userspace */ ret = drm_dev_register(&vgem_device->drm, 0); if (ret) goto out_fini; return 0; out_fini: drm_dev_fini(&vgem_device->drm); out_unregister: platform_device_unregister(vgem_device->platform); out_free: kfree(vgem_device); return ret; } static void __exit vgem_exit(void) { drm_dev_unregister(&vgem_device->drm); drm_dev_unref(&vgem_device->drm); } module_init(vgem_init); module_exit(vgem_exit); MODULE_AUTHOR("Red Hat, Inc."); MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL and additional rights");
2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 // SPDX-License-Identifier: GPL-2.0 /* * xfrm6_input.c: based on net/ipv4/xfrm4_input.c * * Authors: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * YOSHIFUJI Hideaki @USAGI * IPv6 support */ #include <linux/module.h> #include <linux/string.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <net/ipv6.h> #include <net/xfrm.h> int xfrm6_extract_input(struct xfrm_state *x, struct sk_buff *skb) { return xfrm6_extract_header(skb); } int xfrm6_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi, struct ip6_tnl *t) { XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t; XFRM_SPI_SKB_CB(skb)->family = AF_INET6; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr); return xfrm_input(skb, nexthdr, spi, 0); } EXPORT_SYMBOL(xfrm6_rcv_spi); static int xfrm6_transport_finish2(struct net *net, struct sock *sk, struct sk_buff *skb) { if (xfrm_trans_queue(skb, ip6_rcv_finish)) __kfree_skb(skb); return -1; } int xfrm6_transport_finish(struct sk_buff *skb, int async) { struct xfrm_offload *xo = xfrm_offload(skb); int nhlen = skb->data - skb_network_header(skb); skb_network_header(skb)[IP6CB(skb)->nhoff] = XFRM_MODE_SKB_CB(skb)->protocol; #ifndef CONFIG_NETFILTER if (!async) return 1; #endif __skb_push(skb, nhlen); ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); skb_postpush_rcsum(skb, skb_network_header(skb), nhlen); if (xo && (xo->flags & XFRM_GRO)) { skb_mac_header_rebuild(skb); skb_reset_transport_header(skb); return -1; } NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, dev_net(skb->dev), NULL, skb, skb->dev, NULL, xfrm6_transport_finish2); return -1; } int xfrm6_rcv_tnl(struct sk_buff *skb, struct ip6_tnl *t) { return xfrm6_rcv_spi(skb, skb_network_header(skb)[IP6CB(skb)->nhoff], 0, t); } EXPORT_SYMBOL(xfrm6_rcv_tnl); int xfrm6_rcv(struct sk_buff *skb) { return xfrm6_rcv_tnl(skb, NULL); } EXPORT_SYMBOL(xfrm6_rcv); int xfrm6_input_addr(struct sk_buff *skb, xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto) { struct net *net = dev_net(skb->dev); struct xfrm_state *x = NULL; int i = 0; if (secpath_set(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINERROR); goto drop; } if (1 + skb->sp->len == XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto drop; } for (i = 0; i < 3; i++) { xfrm_address_t *dst, *src; switch (i) { case 0: dst = daddr; src = saddr; break; case 1: /* lookup state with wild-card source address */ dst = daddr; src = (xfrm_address_t *)&in6addr_any; break; default: /* lookup state with wild-card addresses */ dst = (xfrm_address_t *)&in6addr_any; src = (xfrm_address_t *)&in6addr_any; break; } x = xfrm_state_lookup_byaddr(net, skb->mark, dst, src, proto, AF_INET6); if (!x) continue; spin_lock(&x->lock); if ((!i || (x->props.flags & XFRM_STATE_WILDRECV)) && likely(x->km.state == XFRM_STATE_VALID) && !xfrm_state_check_expire(x)) { spin_unlock(&x->lock); if (x->type->input(x, skb) > 0) { /* found a valid state */ break; } } else spin_unlock(&x->lock); xfrm_state_put(x); x = NULL; } if (!x) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES); xfrm_audit_state_notfound_simple(skb, AF_INET6); goto drop; } skb->sp->xvec[skb->sp->len++] = x; spin_lock(&x->lock); x->curlft.bytes += skb->len; x->curlft.packets++; spin_unlock(&x->lock); return 1; drop: return -1; } EXPORT_SYMBOL(xfrm6_input_addr);
1 7 7 3 7 7 7 7 7 1 1 1 7 7 7 7 7 7 7 3 3 7 7 7 7 1 7 7 7 7 7 7 7 7 7 7 6 1 7 7 7 1 7 1 7 7 7 7 7 7 7 7 7 1 1 7 7 7 7 7 7 7 7 7 6 6 1 1 1 7 7 7 7 7 7 7 1 1 1 7 7 7 7 7 7 3 3 3 7 7 7 7 7 7 7 7 7 7 7 7 7 7 1 1 7 7 7 7 7 7 7 7 1 1 7 7 7 7 7 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 /* * net/sched/cls_flower.c Flower classifier * * Copyright (c) 2015 Jiri Pirko <jiri@resnulli.us> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/rhashtable.h> #include <linux/workqueue.h> #include <linux/if_ether.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/mpls.h> #include <net/sch_generic.h> #include <net/pkt_cls.h> #include <net/ip.h> #include <net/flow_dissector.h> #include <net/dst.h> #include <net/dst_metadata.h> struct fl_flow_key { int indev_ifindex; struct flow_dissector_key_control control; struct flow_dissector_key_control enc_control; struct flow_dissector_key_basic basic; struct flow_dissector_key_eth_addrs eth; struct flow_dissector_key_vlan vlan; union { struct flow_dissector_key_ipv4_addrs ipv4; struct flow_dissector_key_ipv6_addrs ipv6; }; struct flow_dissector_key_ports tp; struct flow_dissector_key_icmp icmp; struct flow_dissector_key_arp arp; struct flow_dissector_key_keyid enc_key_id; union { struct flow_dissector_key_ipv4_addrs enc_ipv4; struct flow_dissector_key_ipv6_addrs enc_ipv6; }; struct flow_dissector_key_ports enc_tp; struct flow_dissector_key_mpls mpls; struct flow_dissector_key_tcp tcp; struct flow_dissector_key_ip ip; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { unsigned short int start; unsigned short int end; }; struct fl_flow_mask { struct fl_flow_key key; struct fl_flow_mask_range range; struct rcu_head rcu; }; struct cls_fl_head { struct rhashtable ht; struct fl_flow_mask mask; struct flow_dissector dissector; bool mask_assigned; struct list_head filters; struct rhashtable_params ht_params; union { struct work_struct work; struct rcu_head rcu; }; struct idr handle_idr; }; struct cls_fl_filter { struct rhash_head ht_node; struct fl_flow_key mkey; struct tcf_exts exts; struct tcf_result res; struct fl_flow_key key; struct list_head list; u32 handle; u32 flags; union { struct work_struct work; struct rcu_head rcu; }; struct net_device *hw_dev; }; static unsigned short int fl_mask_range(const struct fl_flow_mask *mask) { return mask->range.end - mask->range.start; } static void fl_mask_update_range(struct fl_flow_mask *mask) { const u8 *bytes = (const u8 *) &mask->key; size_t size = sizeof(mask->key); size_t i, first = 0, last = size - 1; for (i = 0; i < sizeof(mask->key); i++) { if (bytes[i]) { if (!first && i) first = i; last = i; } } mask->range.start = rounddown(first, sizeof(long)); mask->range.end = roundup(last + 1, sizeof(long)); } static void *fl_key_get_start(struct fl_flow_key *key, const struct fl_flow_mask *mask) { return (u8 *) key + mask->range.start; } static void fl_set_masked_key(struct fl_flow_key *mkey, struct fl_flow_key *key, struct fl_flow_mask *mask) { const long *lkey = fl_key_get_start(key, mask); const long *lmask = fl_key_get_start(&mask->key, mask); long *lmkey = fl_key_get_start(mkey, mask); int i; for (i = 0; i < fl_mask_range(mask); i += sizeof(long)) *lmkey++ = *lkey++ & *lmask++; } static void fl_clear_masked_range(struct fl_flow_key *key, struct fl_flow_mask *mask) { memset(fl_key_get_start(key, mask), 0, fl_mask_range(mask)); } static struct cls_fl_filter *fl_lookup(struct cls_fl_head *head, struct fl_flow_key *mkey) { return rhashtable_lookup_fast(&head->ht, fl_key_get_start(mkey, &head->mask), head->ht_params); } static int fl_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); struct cls_fl_filter *f; struct fl_flow_key skb_key; struct fl_flow_key skb_mkey; struct ip_tunnel_info *info; if (!atomic_read(&head->ht.nelems)) return -1; flow_dissector_init_keys(&skb_key.control, &skb_key.basic); fl_clear_masked_range(&skb_key, &head->mask); info = skb_tunnel_info(skb); if (info) { struct ip_tunnel_key *key = &info->key; switch (ip_tunnel_info_af(info)) { case AF_INET: skb_key.enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; skb_key.enc_ipv4.src = key->u.ipv4.src; skb_key.enc_ipv4.dst = key->u.ipv4.dst; break; case AF_INET6: skb_key.enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; skb_key.enc_ipv6.src = key->u.ipv6.src; skb_key.enc_ipv6.dst = key->u.ipv6.dst; break; } skb_key.enc_key_id.keyid = tunnel_id_to_key32(key->tun_id); skb_key.enc_tp.src = key->tp_src; skb_key.enc_tp.dst = key->tp_dst; } skb_key.indev_ifindex = skb->skb_iif; /* skb_flow_dissect() does not set n_proto in case an unknown protocol, * so do it rather here. */ skb_key.basic.n_proto = skb->protocol; skb_flow_dissect(skb, &head->dissector, &skb_key, 0); fl_set_masked_key(&skb_mkey, &skb_key, &head->mask); f = fl_lookup(head, &skb_mkey); if (f && !tc_skip_sw(f->flags)) { *res = f->res; return tcf_exts_exec(skb, &f->exts, res); } return -1; } static int fl_init(struct tcf_proto *tp) { struct cls_fl_head *head; head = kzalloc(sizeof(*head), GFP_KERNEL); if (!head) return -ENOBUFS; INIT_LIST_HEAD_RCU(&head->filters); rcu_assign_pointer(tp->root, head); idr_init(&head->handle_idr); return 0; } static void __fl_destroy_filter(struct cls_fl_filter *f) { tcf_exts_destroy(&f->exts); tcf_exts_put_net(&f->exts); kfree(f); } static void fl_destroy_filter_work(struct work_struct *work) { struct cls_fl_filter *f = container_of(work, struct cls_fl_filter, work); rtnl_lock(); __fl_destroy_filter(f); rtnl_unlock(); } static void fl_destroy_filter(struct rcu_head *head) { struct cls_fl_filter *f = container_of(head, struct cls_fl_filter, rcu); INIT_WORK(&f->work, fl_destroy_filter_work); tcf_queue_work(&f->work); } static void fl_hw_destroy_filter(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = f->hw_dev; if (!tc_can_offload(dev)) return; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_DESTROY; cls_flower.cookie = (unsigned long) f; cls_flower.egress_dev = f->hw_dev != tp->q->dev_queue->dev; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); } static int fl_hw_replace_filter(struct tcf_proto *tp, struct flow_dissector *dissector, struct fl_flow_key *mask, struct cls_fl_filter *f) { struct net_device *dev = tp->q->dev_queue->dev; struct tc_cls_flower_offload cls_flower = {}; int err; if (!tc_can_offload(dev)) { if (tcf_exts_get_dev(dev, &f->exts, &f->hw_dev) || (f->hw_dev && !tc_can_offload(f->hw_dev))) { f->hw_dev = dev; return tc_skip_sw(f->flags) ? -EINVAL : 0; } dev = f->hw_dev; cls_flower.egress_dev = true; } else { f->hw_dev = dev; } tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_REPLACE; cls_flower.cookie = (unsigned long) f; cls_flower.dissector = dissector; cls_flower.mask = mask; cls_flower.key = &f->mkey; cls_flower.exts = &f->exts; err = dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); if (!err) f->flags |= TCA_CLS_FLAGS_IN_HW; if (tc_skip_sw(f->flags)) return err; return 0; } static void fl_hw_update_stats(struct tcf_proto *tp, struct cls_fl_filter *f) { struct tc_cls_flower_offload cls_flower = {}; struct net_device *dev = f->hw_dev; if (!tc_can_offload(dev)) return; tc_cls_common_offload_init(&cls_flower.common, tp); cls_flower.command = TC_CLSFLOWER_STATS; cls_flower.cookie = (unsigned long) f; cls_flower.exts = &f->exts; cls_flower.egress_dev = f->hw_dev != tp->q->dev_queue->dev; dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_CLSFLOWER, &cls_flower); } static void __fl_delete(struct tcf_proto *tp, struct cls_fl_filter *f) { struct cls_fl_head *head = rtnl_dereference(tp->root); idr_remove_ext(&head->handle_idr, f->handle); list_del_rcu(&f->list); if (!tc_skip_hw(f->flags)) fl_hw_destroy_filter(tp, f); tcf_unbind_filter(tp, &f->res); if (tcf_exts_get_net(&f->exts)) call_rcu(&f->rcu, fl_destroy_filter); else __fl_destroy_filter(f); } static void fl_destroy_sleepable(struct work_struct *work) { struct cls_fl_head *head = container_of(work, struct cls_fl_head, work); if (head->mask_assigned) rhashtable_destroy(&head->ht); kfree(head); module_put(THIS_MODULE); } static void fl_destroy_rcu(struct rcu_head *rcu) { struct cls_fl_head *head = container_of(rcu, struct cls_fl_head, rcu); INIT_WORK(&head->work, fl_destroy_sleepable); schedule_work(&head->work); } static void fl_destroy(struct tcf_proto *tp) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f, *next; list_for_each_entry_safe(f, next, &head->filters, list) __fl_delete(tp, f); idr_destroy(&head->handle_idr); __module_get(THIS_MODULE); call_rcu(&head->rcu, fl_destroy_rcu); } static void *fl_get(struct tcf_proto *tp, u32 handle) { struct cls_fl_head *head = rtnl_dereference(tp->root); return idr_find_ext(&head->handle_idr, handle); } static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_UNSPEC] = { .type = NLA_UNSPEC }, [TCA_FLOWER_CLASSID] = { .type = NLA_U32 }, [TCA_FLOWER_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_FLOWER_KEY_ETH_DST] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ETH_DST_MASK] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ETH_SRC] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ETH_SRC_MASK] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ETH_TYPE] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_IP_PROTO] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IPV4_SRC] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_IPV4_SRC_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_IPV4_DST] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_IPV4_DST_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_TCP_SRC] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_DST] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_SRC] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_DST] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_VLAN_ID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_VLAN_PRIO] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_VLAN_ETH_TYPE] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_KEY_ID] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ENC_IPV4_SRC] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ENC_IPV4_DST] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ENC_IPV4_DST_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ENC_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_ENC_IPV6_DST] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) }, [TCA_FLOWER_KEY_TCP_SRC_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_DST_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_SRC_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_UDP_DST_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_SCTP_SRC_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_SCTP_DST_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_SCTP_SRC] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_SCTP_DST] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_FLAGS] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_FLAGS_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ICMPV4_TYPE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV4_TYPE_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV4_CODE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV4_CODE_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_TYPE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_TYPE_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_CODE] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ICMPV6_CODE_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ARP_SIP] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ARP_SIP_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ARP_TIP] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ARP_TIP_MASK] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_ARP_OP] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ARP_OP_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_ARP_SHA] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_MPLS_TTL] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, [TCA_FLOWER_KEY_TCP_FLAGS] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_TCP_FLAGS_MASK] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_IP_TOS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TOS_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_IP_TTL_MASK] = { .type = NLA_U8 }, [TCA_FLOWER_FLAGS] = { .type = NLA_U32 }, }; static void fl_set_key_val(struct nlattr **tb, void *val, int val_type, void *mask, int mask_type, int len) { if (!tb[val_type]) return; memcpy(val, nla_data(tb[val_type]), len); if (mask_type == TCA_FLOWER_UNSPEC || !tb[mask_type]) memset(mask, 0xff, len); else memcpy(mask, nla_data(tb[mask_type]), len); } static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_val, struct flow_dissector_key_mpls *key_mask) { if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); key_mask->mpls_ttl = MPLS_TTL_MASK; } if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); if (bos & ~MPLS_BOS_MASK) return -EINVAL; key_val->mpls_bos = bos; key_mask->mpls_bos = MPLS_BOS_MASK; } if (tb[TCA_FLOWER_KEY_MPLS_TC]) { u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]); if (tc & ~MPLS_TC_MASK) return -EINVAL; key_val->mpls_tc = tc; key_mask->mpls_tc = MPLS_TC_MASK; } if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]); if (label & ~MPLS_LABEL_MASK) return -EINVAL; key_val->mpls_label = label; key_mask->mpls_label = MPLS_LABEL_MASK; } return 0; } static void fl_set_key_vlan(struct nlattr **tb, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) { #define VLAN_PRIORITY_MASK 0x7 if (tb[TCA_FLOWER_KEY_VLAN_ID]) { key_val->vlan_id = nla_get_u16(tb[TCA_FLOWER_KEY_VLAN_ID]) & VLAN_VID_MASK; key_mask->vlan_id = VLAN_VID_MASK; } if (tb[TCA_FLOWER_KEY_VLAN_PRIO]) { key_val->vlan_priority = nla_get_u8(tb[TCA_FLOWER_KEY_VLAN_PRIO]) & VLAN_PRIORITY_MASK; key_mask->vlan_priority = VLAN_PRIORITY_MASK; } } static void fl_set_key_flag(u32 flower_key, u32 flower_mask, u32 *dissector_key, u32 *dissector_mask, u32 flower_flag_bit, u32 dissector_flag_bit) { if (flower_mask & flower_flag_bit) { *dissector_mask |= dissector_flag_bit; if (flower_key & flower_flag_bit) *dissector_key |= dissector_flag_bit; } } static int fl_set_key_flags(struct nlattr **tb, u32 *flags_key, u32 *flags_mask) { u32 key, mask; /* mask is mandatory for flags */ if (!tb[TCA_FLOWER_KEY_FLAGS_MASK]) return -EINVAL; key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS])); mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK])); *flags_key = 0; *flags_mask = 0; fl_set_key_flag(key, mask, flags_key, flags_mask, TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); return 0; } static void fl_set_key_ip(struct nlattr **tb, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { fl_set_key_val(tb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)); fl_set_key_val(tb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl)); } static int fl_set_key(struct net *net, struct nlattr **tb, struct fl_flow_key *key, struct fl_flow_key *mask) { __be16 ethertype; int ret = 0; #ifdef CONFIG_NET_CLS_IND if (tb[TCA_FLOWER_INDEV]) { int err = tcf_change_indev(net, tb[TCA_FLOWER_INDEV]); if (err < 0) return err; key->indev_ifindex = err; mask->indev_ifindex = 0xffffffff; } #endif fl_set_key_val(tb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)); fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, sizeof(key->eth.src)); if (tb[TCA_FLOWER_KEY_ETH_TYPE]) { ethertype = nla_get_be16(tb[TCA_FLOWER_KEY_ETH_TYPE]); if (ethertype == htons(ETH_P_8021Q)) { fl_set_key_vlan(tb, &key->vlan, &mask->vlan); fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_VLAN_ETH_TYPE, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.n_proto)); } else { key->basic.n_proto = ethertype; mask->basic.n_proto = cpu_to_be16(~0); } } if (key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) { fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)); fl_set_key_ip(tb, &key->ip, &mask->ip); } if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) { key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; mask->control.addr_type = ~0; fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, sizeof(key->ipv4.src)); fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, sizeof(key->ipv4.dst)); } else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) { key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; mask->control.addr_type = ~0; fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, sizeof(key->ipv6.src)); fl_set_key_val(tb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, sizeof(key->ipv6.dst)); } if (key->basic.ip_proto == IPPROTO_TCP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)); fl_set_key_val(tb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS, &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK, sizeof(key->tcp.flags)); } else if (key->basic.ip_proto == IPPROTO_UDP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)); } else if (key->basic.ip_proto == IPPROTO_SCTP) { fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC, &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK, sizeof(key->tp.src)); fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST, &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, sizeof(key->tp.dst)); } else if (key->basic.n_proto == htons(ETH_P_IP) && key->basic.ip_proto == IPPROTO_ICMP) { fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE_MASK, sizeof(key->icmp.type)); fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE_MASK, sizeof(key->icmp.code)); } else if (key->basic.n_proto == htons(ETH_P_IPV6) && key->basic.ip_proto == IPPROTO_ICMPV6) { fl_set_key_val(tb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, sizeof(key->icmp.type)); fl_set_key_val(tb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE_MASK, sizeof(key->icmp.code)); } else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) || key->basic.n_proto == htons(ETH_P_MPLS_MC)) { ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls); if (ret) return ret; } else if (key->basic.n_proto == htons(ETH_P_ARP) || key->basic.n_proto == htons(ETH_P_RARP)) { fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK, sizeof(key->arp.sip)); fl_set_key_val(tb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK, sizeof(key->arp.tip)); fl_set_key_val(tb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP, &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK, sizeof(key->arp.op)); fl_set_key_val(tb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA, mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK, sizeof(key->arp.sha)); fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, sizeof(key->arp.tha)); } if (tb[TCA_FLOWER_KEY_ENC_IPV4_SRC] || tb[TCA_FLOWER_KEY_ENC_IPV4_DST]) { key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; mask->enc_control.addr_type = ~0; fl_set_key_val(tb, &key->enc_ipv4.src, TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, sizeof(key->enc_ipv4.src)); fl_set_key_val(tb, &key->enc_ipv4.dst, TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst, TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, sizeof(key->enc_ipv4.dst)); } if (tb[TCA_FLOWER_KEY_ENC_IPV6_SRC] || tb[TCA_FLOWER_KEY_ENC_IPV6_DST]) { key->enc_control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; mask->enc_control.addr_type = ~0; fl_set_key_val(tb, &key->enc_ipv6.src, TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src, TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, sizeof(key->enc_ipv6.src)); fl_set_key_val(tb, &key->enc_ipv6.dst, TCA_FLOWER_KEY_ENC_IPV6_DST, &mask->enc_ipv6.dst, TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, sizeof(key->enc_ipv6.dst)); } fl_set_key_val(tb, &key->enc_key_id.keyid, TCA_FLOWER_KEY_ENC_KEY_ID, &mask->enc_key_id.keyid, TCA_FLOWER_UNSPEC, sizeof(key->enc_key_id.keyid)); fl_set_key_val(tb, &key->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, &mask->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, sizeof(key->enc_tp.src)); fl_set_key_val(tb, &key->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst)); if (tb[TCA_FLOWER_KEY_FLAGS]) ret = fl_set_key_flags(tb, &key->control.flags, &mask->control.flags); return ret; } static bool fl_mask_eq(struct fl_flow_mask *mask1, struct fl_flow_mask *mask2) { const long *lmask1 = fl_key_get_start(&mask1->key, mask1); const long *lmask2 = fl_key_get_start(&mask2->key, mask2); return !memcmp(&mask1->range, &mask2->range, sizeof(mask1->range)) && !memcmp(lmask1, lmask2, fl_mask_range(mask1)); } static const struct rhashtable_params fl_ht_params = { .key_offset = offsetof(struct cls_fl_filter, mkey), /* base offset */ .head_offset = offsetof(struct cls_fl_filter, ht_node), .automatic_shrinking = true, }; static int fl_init_hashtable(struct cls_fl_head *head, struct fl_flow_mask *mask) { head->ht_params = fl_ht_params; head->ht_params.key_len = fl_mask_range(mask); head->ht_params.key_offset += mask->range.start; return rhashtable_init(&head->ht, &head->ht_params); } #define FL_KEY_MEMBER_OFFSET(member) offsetof(struct fl_flow_key, member) #define FL_KEY_MEMBER_SIZE(member) (sizeof(((struct fl_flow_key *) 0)->member)) #define FL_KEY_IS_MASKED(mask, member) \ memchr_inv(((char *)mask) + FL_KEY_MEMBER_OFFSET(member), \ 0, FL_KEY_MEMBER_SIZE(member)) \ #define FL_KEY_SET(keys, cnt, id, member) \ do { \ keys[cnt].key_id = id; \ keys[cnt].offset = FL_KEY_MEMBER_OFFSET(member); \ cnt++; \ } while(0); #define FL_KEY_SET_IF_MASKED(mask, keys, cnt, id, member) \ do { \ if (FL_KEY_IS_MASKED(mask, member)) \ FL_KEY_SET(keys, cnt, id, member); \ } while(0); static void fl_init_dissector(struct cls_fl_head *head, struct fl_flow_mask *mask) { struct flow_dissector_key keys[FLOW_DISSECTOR_KEY_MAX]; size_t cnt = 0; FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_CONTROL, control); FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_BASIC, basic); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ETH_ADDRS, eth); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_IPV4_ADDRS, ipv4); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_IPV6_ADDRS, ipv6); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_PORTS, tp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_IP, ip); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_TCP, tcp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ICMP, icmp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ARP, arp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_MPLS, mpls); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_KEYID, enc_key_id); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, enc_ipv4); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, enc_ipv6); if (FL_KEY_IS_MASKED(&mask->key, enc_ipv4) || FL_KEY_IS_MASKED(&mask->key, enc_ipv6)) FL_KEY_SET(keys, cnt, FLOW_DISSECTOR_KEY_ENC_CONTROL, enc_control); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ENC_PORTS, enc_tp); skb_flow_dissector_init(&head->dissector, keys, cnt); } static int fl_check_assign_mask(struct cls_fl_head *head, struct fl_flow_mask *mask) { int err; if (head->mask_assigned) { if (!fl_mask_eq(&head->mask, mask)) return -EINVAL; else return 0; } /* Mask is not assigned yet. So assign it and init hashtable * according to that. */ err = fl_init_hashtable(head, mask); if (err) return err; memcpy(&head->mask, mask, sizeof(head->mask)); head->mask_assigned = true; fl_init_dissector(head, mask); return 0; } static int fl_set_parms(struct net *net, struct tcf_proto *tp, struct cls_fl_filter *f, struct fl_flow_mask *mask, unsigned long base, struct nlattr **tb, struct nlattr *est, bool ovr) { int err; err = tcf_exts_validate(net, tp, tb, est, &f->exts, ovr); if (err < 0) return err; if (tb[TCA_FLOWER_CLASSID]) { f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); tcf_bind_filter(tp, &f->res, base); } err = fl_set_key(net, tb, &f->key, &mask->key); if (err) return err; fl_mask_update_range(mask); fl_set_masked_key(&f->mkey, &f->key, mask); return 0; } static int fl_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, bool ovr) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *fold = *arg; struct cls_fl_filter *fnew; struct nlattr **tb; struct fl_flow_mask mask = {}; unsigned long idr_index; int err; if (!tca[TCA_OPTIONS]) return -EINVAL; tb = kcalloc(TCA_FLOWER_MAX + 1, sizeof(struct nlattr *), GFP_KERNEL); if (!tb) return -ENOBUFS; err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy, NULL); if (err < 0) goto errout_tb; if (fold && handle && fold->handle != handle) { err = -EINVAL; goto errout_tb; } fnew = kzalloc(sizeof(*fnew), GFP_KERNEL); if (!fnew) { err = -ENOBUFS; goto errout_tb; } err = tcf_exts_init(&fnew->exts, TCA_FLOWER_ACT, 0); if (err < 0) goto errout; if (!handle) { err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, 1, 0x80000000, GFP_KERNEL); if (err) goto errout; fnew->handle = idr_index; } /* user specifies a handle and it doesn't exist */ if (handle && !fold) { err = idr_alloc_ext(&head->handle_idr, fnew, &idr_index, handle, handle + 1, GFP_KERNEL); if (err) goto errout; fnew->handle = idr_index; } if (tb[TCA_FLOWER_FLAGS]) { fnew->flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]); if (!tc_flags_valid(fnew->flags)) { err = -EINVAL; goto errout_idr; } } err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr); if (err) goto errout_idr; err = fl_check_assign_mask(head, &mask); if (err) goto errout_idr; if (!tc_skip_sw(fnew->flags)) { if (!fold && fl_lookup(head, &fnew->mkey)) { err = -EEXIST; goto errout_idr; } err = rhashtable_insert_fast(&head->ht, &fnew->ht_node, head->ht_params); if (err) goto errout_idr; } if (!tc_skip_hw(fnew->flags)) { err = fl_hw_replace_filter(tp, &head->dissector, &mask.key, fnew); if (err) goto errout_idr; } if (!tc_in_hw(fnew->flags)) fnew->flags |= TCA_CLS_FLAGS_NOT_IN_HW; if (fold) { if (!tc_skip_sw(fold->flags)) rhashtable_remove_fast(&head->ht, &fold->ht_node, head->ht_params); if (!tc_skip_hw(fold->flags)) fl_hw_destroy_filter(tp, fold); } *arg = fnew; if (fold) { fnew->handle = handle; idr_replace_ext(&head->handle_idr, fnew, fnew->handle); list_replace_rcu(&fold->list, &fnew->list); tcf_unbind_filter(tp, &fold->res); tcf_exts_get_net(&fold->exts); call_rcu(&fold->rcu, fl_destroy_filter); } else { list_add_tail_rcu(&fnew->list, &head->filters); } kfree(tb); return 0; errout_idr: if (!fold) idr_remove_ext(&head->handle_idr, fnew->handle); errout: tcf_exts_destroy(&fnew->exts); kfree(fnew); errout_tb: kfree(tb); return err; } static int fl_delete(struct tcf_proto *tp, void *arg, bool *last) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = arg; if (!tc_skip_sw(f->flags)) rhashtable_remove_fast(&head->ht, &f->ht_node, head->ht_params); __fl_delete(tp, f); *last = list_empty(&head->filters); return 0; } static void fl_walk(struct tcf_proto *tp, struct tcf_walker *arg) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f; list_for_each_entry_rcu(f, &head->filters, list) { if (arg->count < arg->skip) goto skip; if (arg->fn(tp, f, arg) < 0) { arg->stop = 1; break; } skip: arg->count++; } } static int fl_dump_key_val(struct sk_buff *skb, void *val, int val_type, void *mask, int mask_type, int len) { int err; if (!memchr_inv(mask, 0, len)) return 0; err = nla_put(skb, val_type, len, val); if (err) return err; if (mask_type != TCA_FLOWER_UNSPEC) { err = nla_put(skb, mask_type, len, mask); if (err) return err; } return 0; } static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) { int err; if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) return 0; if (mpls_mask->mpls_ttl) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, mpls_key->mpls_ttl); if (err) return err; } if (mpls_mask->mpls_tc) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC, mpls_key->mpls_tc); if (err) return err; } if (mpls_mask->mpls_label) { err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL, mpls_key->mpls_label); if (err) return err; } if (mpls_mask->mpls_bos) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS, mpls_key->mpls_bos); if (err) return err; } return 0; } static int fl_dump_key_ip(struct sk_buff *skb, struct flow_dissector_key_ip *key, struct flow_dissector_key_ip *mask) { if (fl_dump_key_val(skb, &key->tos, TCA_FLOWER_KEY_IP_TOS, &mask->tos, TCA_FLOWER_KEY_IP_TOS_MASK, sizeof(key->tos)) || fl_dump_key_val(skb, &key->ttl, TCA_FLOWER_KEY_IP_TTL, &mask->ttl, TCA_FLOWER_KEY_IP_TTL_MASK, sizeof(key->ttl))) return -1; return 0; } static int fl_dump_key_vlan(struct sk_buff *skb, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) { int err; if (!memchr_inv(vlan_mask, 0, sizeof(*vlan_mask))) return 0; if (vlan_mask->vlan_id) { err = nla_put_u16(skb, TCA_FLOWER_KEY_VLAN_ID, vlan_key->vlan_id); if (err) return err; } if (vlan_mask->vlan_priority) { err = nla_put_u8(skb, TCA_FLOWER_KEY_VLAN_PRIO, vlan_key->vlan_priority); if (err) return err; } return 0; } static void fl_get_key_flag(u32 dissector_key, u32 dissector_mask, u32 *flower_key, u32 *flower_mask, u32 flower_flag_bit, u32 dissector_flag_bit) { if (dissector_mask & dissector_flag_bit) { *flower_mask |= flower_flag_bit; if (dissector_key & dissector_flag_bit) *flower_key |= flower_flag_bit; } } static int fl_dump_key_flags(struct sk_buff *skb, u32 flags_key, u32 flags_mask) { u32 key, mask; __be32 _key, _mask; int err; if (!memchr_inv(&flags_mask, 0, sizeof(flags_mask))) return 0; key = 0; mask = 0; fl_get_key_flag(flags_key, flags_mask, &key, &mask, TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT, FLOW_DIS_IS_FRAGMENT); _key = cpu_to_be32(key); _mask = cpu_to_be32(mask); err = nla_put(skb, TCA_FLOWER_KEY_FLAGS, 4, &_key); if (err) return err; return nla_put(skb, TCA_FLOWER_KEY_FLAGS_MASK, 4, &_mask); } static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = fh; struct nlattr *nest; struct fl_flow_key *key, *mask; if (!f) return skb->len; t->tcm_handle = f->handle; nest = nla_nest_start(skb, TCA_OPTIONS); if (!nest) goto nla_put_failure; if (f->res.classid && nla_put_u32(skb, TCA_FLOWER_CLASSID, f->res.classid)) goto nla_put_failure; key = &f->key; mask = &head->mask.key; if (mask->indev_ifindex) { struct net_device *dev; dev = __dev_get_by_index(net, key->indev_ifindex); if (dev && nla_put_string(skb, TCA_FLOWER_INDEV, dev->name)) goto nla_put_failure; } if (!tc_skip_hw(f->flags)) fl_hw_update_stats(tp, f); if (fl_dump_key_val(skb, key->eth.dst, TCA_FLOWER_KEY_ETH_DST, mask->eth.dst, TCA_FLOWER_KEY_ETH_DST_MASK, sizeof(key->eth.dst)) || fl_dump_key_val(skb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC, mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK, sizeof(key->eth.src)) || fl_dump_key_val(skb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE, &mask->basic.n_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.n_proto))) goto nla_put_failure; if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls)) goto nla_put_failure; if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) goto nla_put_failure; if ((key->basic.n_proto == htons(ETH_P_IP) || key->basic.n_proto == htons(ETH_P_IPV6)) && (fl_dump_key_val(skb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO, &mask->basic.ip_proto, TCA_FLOWER_UNSPEC, sizeof(key->basic.ip_proto)) || fl_dump_key_ip(skb, &key->ip, &mask->ip))) goto nla_put_failure; if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC, &mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK, sizeof(key->ipv4.src)) || fl_dump_key_val(skb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST, &mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK, sizeof(key->ipv4.dst)))) goto nla_put_failure; else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && (fl_dump_key_val(skb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC, &mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK, sizeof(key->ipv6.src)) || fl_dump_key_val(skb, &key->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST, &mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK, sizeof(key->ipv6.dst)))) goto nla_put_failure; if (key->basic.ip_proto == IPPROTO_TCP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC, &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST, &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK, sizeof(key->tp.dst)) || fl_dump_key_val(skb, &key->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS, &mask->tcp.flags, TCA_FLOWER_KEY_TCP_FLAGS_MASK, sizeof(key->tcp.flags)))) goto nla_put_failure; else if (key->basic.ip_proto == IPPROTO_UDP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC, &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST, &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; else if (key->basic.ip_proto == IPPROTO_SCTP && (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_SCTP_SRC, &mask->tp.src, TCA_FLOWER_KEY_SCTP_SRC_MASK, sizeof(key->tp.src)) || fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_SCTP_DST, &mask->tp.dst, TCA_FLOWER_KEY_SCTP_DST_MASK, sizeof(key->tp.dst)))) goto nla_put_failure; else if (key->basic.n_proto == htons(ETH_P_IP) && key->basic.ip_proto == IPPROTO_ICMP && (fl_dump_key_val(skb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE, &mask->icmp.type, TCA_FLOWER_KEY_ICMPV4_TYPE_MASK, sizeof(key->icmp.type)) || fl_dump_key_val(skb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE, &mask->icmp.code, TCA_FLOWER_KEY_ICMPV4_CODE_MASK, sizeof(key->icmp.code)))) goto nla_put_failure; else if (key->basic.n_proto == htons(ETH_P_IPV6) && key->basic.ip_proto == IPPROTO_ICMPV6 && (fl_dump_key_val(skb, &key->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE, &mask->icmp.type, TCA_FLOWER_KEY_ICMPV6_TYPE_MASK, sizeof(key->icmp.type)) || fl_dump_key_val(skb, &key->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE, &mask->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE_MASK, sizeof(key->icmp.code)))) goto nla_put_failure; else if ((key->basic.n_proto == htons(ETH_P_ARP) || key->basic.n_proto == htons(ETH_P_RARP)) && (fl_dump_key_val(skb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP, &mask->arp.sip, TCA_FLOWER_KEY_ARP_SIP_MASK, sizeof(key->arp.sip)) || fl_dump_key_val(skb, &key->arp.tip, TCA_FLOWER_KEY_ARP_TIP, &mask->arp.tip, TCA_FLOWER_KEY_ARP_TIP_MASK, sizeof(key->arp.tip)) || fl_dump_key_val(skb, &key->arp.op, TCA_FLOWER_KEY_ARP_OP, &mask->arp.op, TCA_FLOWER_KEY_ARP_OP_MASK, sizeof(key->arp.op)) || fl_dump_key_val(skb, key->arp.sha, TCA_FLOWER_KEY_ARP_SHA, mask->arp.sha, TCA_FLOWER_KEY_ARP_SHA_MASK, sizeof(key->arp.sha)) || fl_dump_key_val(skb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, sizeof(key->arp.tha)))) goto nla_put_failure; if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv4.src, TCA_FLOWER_KEY_ENC_IPV4_SRC, &mask->enc_ipv4.src, TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK, sizeof(key->enc_ipv4.src)) || fl_dump_key_val(skb, &key->enc_ipv4.dst, TCA_FLOWER_KEY_ENC_IPV4_DST, &mask->enc_ipv4.dst, TCA_FLOWER_KEY_ENC_IPV4_DST_MASK, sizeof(key->enc_ipv4.dst)))) goto nla_put_failure; else if (key->enc_control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS && (fl_dump_key_val(skb, &key->enc_ipv6.src, TCA_FLOWER_KEY_ENC_IPV6_SRC, &mask->enc_ipv6.src, TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK, sizeof(key->enc_ipv6.src)) || fl_dump_key_val(skb, &key->enc_ipv6.dst, TCA_FLOWER_KEY_ENC_IPV6_DST, &mask->enc_ipv6.dst, TCA_FLOWER_KEY_ENC_IPV6_DST_MASK, sizeof(key->enc_ipv6.dst)))) goto nla_put_failure; if (fl_dump_key_val(skb, &key->enc_key_id, TCA_FLOWER_KEY_ENC_KEY_ID, &mask->enc_key_id, TCA_FLOWER_UNSPEC, sizeof(key->enc_key_id)) || fl_dump_key_val(skb, &key->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, &mask->enc_tp.src, TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, sizeof(key->enc_tp.src)) || fl_dump_key_val(skb, &key->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT, &mask->enc_tp.dst, TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, sizeof(key->enc_tp.dst))) goto nla_put_failure; if (fl_dump_key_flags(skb, key->control.flags, mask->control.flags)) goto nla_put_failure; if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) goto nla_put_failure; if (tcf_exts_dump(skb, &f->exts)) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &f->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void fl_bind_class(void *fh, u32 classid, unsigned long cl) { struct cls_fl_filter *f = fh; if (f && f->res.classid == classid) f->res.class = cl; } static struct tcf_proto_ops cls_fl_ops __read_mostly = { .kind = "flower", .classify = fl_classify, .init = fl_init, .destroy = fl_destroy, .get = fl_get, .change = fl_change, .delete = fl_delete, .walk = fl_walk, .dump = fl_dump, .bind_class = fl_bind_class, .owner = THIS_MODULE, }; static int __init cls_fl_init(void) { return register_tcf_proto_ops(&cls_fl_ops); } static void __exit cls_fl_exit(void) { unregister_tcf_proto_ops(&cls_fl_ops); } module_init(cls_fl_init); module_exit(cls_fl_exit); MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>"); MODULE_DESCRIPTION("Flower classifier"); MODULE_LICENSE("GPL v2");
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 /* * IPV4 GSO/GRO offload support * Linux INET implementation * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * GRE GSO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <net/gre.h> static struct sk_buff *gre_gso_segment(struct sk_buff *skb, netdev_features_t features) { int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); bool need_csum, need_recompute_csum, gso_partial; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 protocol = skb->protocol; u16 mac_len = skb->mac_len; int gre_offset, outer_hlen; if (!skb->encapsulation) goto out; if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr))) goto out; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; /* setup inner skb. */ skb->encapsulation = 0; SKB_GSO_CB(skb)->encap_level = 0; __skb_pull(skb, tnl_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, skb_inner_network_offset(skb)); skb->mac_len = skb_inner_network_offset(skb); skb->protocol = skb->inner_protocol; need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM); need_recompute_csum = skb->csum_not_inet; skb->encap_hdr_csum = need_csum; features &= skb->dev->hw_enc_features; /* segment inner packet. */ segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset, mac_len); goto out; } gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); outer_hlen = skb_tnl_header_len(skb); gre_offset = outer_hlen - tnl_hlen; skb = segs; do { struct gre_base_hdr *greh; __sum16 *pcsum; /* Set up inner headers if we are offloading inner checksum */ if (skb->ip_summed == CHECKSUM_PARTIAL) { skb_reset_inner_headers(skb); skb->encapsulation = 1; } skb->mac_len = mac_len; skb->protocol = protocol; __skb_push(skb, outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, mac_len); skb_set_transport_header(skb, gre_offset); if (!need_csum) continue; greh = (struct gre_base_hdr *)skb_transport_header(skb); pcsum = (__sum16 *)(greh + 1); if (gso_partial && skb_is_gso(skb)) { unsigned int partial_adj; /* Adjust checksum to account for the fact that * the partial checksum is based on actual size * whereas headers should be based on MSS size. */ partial_adj = skb->len + skb_headroom(skb) - SKB_GSO_CB(skb)->data_offset - skb_shinfo(skb)->gso_size; *pcsum = ~csum_fold((__force __wsum)htonl(partial_adj)); } else { *pcsum = 0; } *(pcsum + 1) = 0; if (need_recompute_csum && !skb_is_gso(skb)) { __wsum csum; csum = skb_checksum(skb, gre_offset, skb->len - gre_offset, 0); *pcsum = csum_fold(csum); } else { *pcsum = gso_make_checksum(skb, 0); } } while ((skb = skb->next)); out: return segs; } static struct sk_buff **gre_gro_receive(struct sk_buff **head, struct sk_buff *skb) { struct sk_buff **pp = NULL; struct sk_buff *p; const struct gre_base_hdr *greh; unsigned int hlen, grehlen; unsigned int off; int flush = 1; struct packet_offload *ptype; __be16 type; if (NAPI_GRO_CB(skb)->encap_mark) goto out; NAPI_GRO_CB(skb)->encap_mark = 1; off = skb_gro_offset(skb); hlen = off + sizeof(*greh); greh = skb_gro_header_fast(skb, off); if (skb_gro_header_hard(skb, hlen)) { greh = skb_gro_header_slow(skb, hlen, off); if (unlikely(!greh)) goto out; } /* Only support version 0 and K (key), C (csum) flags. Note that * although the support for the S (seq#) flag can be added easily * for GRO, this is problematic for GSO hence can not be enabled * here because a GRO pkt may end up in the forwarding path, thus * requiring GSO support to break it up correctly. */ if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0) goto out; /* We can only support GRE_CSUM if we can track the location of * the GRE header. In the case of FOU/GUE we cannot because the * outer UDP header displaces the GRE header leaving us in a state * of limbo. */ if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou) goto out; type = greh->protocol; rcu_read_lock(); ptype = gro_find_receive_by_type(type); if (!ptype) goto out_unlock; grehlen = GRE_HEADER_SECTION; if (greh->flags & GRE_KEY) grehlen += GRE_HEADER_SECTION; if (greh->flags & GRE_CSUM) grehlen += GRE_HEADER_SECTION; hlen = off + grehlen; if (skb_gro_header_hard(skb, hlen)) { greh = skb_gro_header_slow(skb, hlen, off); if (unlikely(!greh)) goto out_unlock; } /* Don't bother verifying checksum if we're going to flush anyway. */ if ((greh->flags & GRE_CSUM) && !NAPI_GRO_CB(skb)->flush) { if (skb_gro_checksum_simple_validate(skb)) goto out_unlock; skb_gro_checksum_try_convert(skb, IPPROTO_GRE, 0, null_compute_pseudo); } for (p = *head; p; p = p->next) { const struct gre_base_hdr *greh2; if (!NAPI_GRO_CB(p)->same_flow) continue; /* The following checks are needed to ensure only pkts * from the same tunnel are considered for aggregation. * The criteria for "the same tunnel" includes: * 1) same version (we only support version 0 here) * 2) same protocol (we only support ETH_P_IP for now) * 3) same set of flags * 4) same key if the key field is present. */ greh2 = (struct gre_base_hdr *)(p->data + off); if (greh2->flags != greh->flags || greh2->protocol != greh->protocol) { NAPI_GRO_CB(p)->same_flow = 0; continue; } if (greh->flags & GRE_KEY) { /* compare keys */ if (*(__be32 *)(greh2+1) != *(__be32 *)(greh+1)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } } } skb_gro_pull(skb, grehlen); /* Adjusted NAPI_GRO_CB(skb)->csum after skb_gro_pull()*/ skb_gro_postpull_rcsum(skb, greh, grehlen); pp = call_gro_receive(ptype->callbacks.gro_receive, head, skb); flush = 0; out_unlock: rcu_read_unlock(); out: skb_gro_flush_final(skb, pp, flush); return pp; } static int gre_gro_complete(struct sk_buff *skb, int nhoff) { struct gre_base_hdr *greh = (struct gre_base_hdr *)(skb->data + nhoff); struct packet_offload *ptype; unsigned int grehlen = sizeof(*greh); int err = -ENOENT; __be16 type; skb->encapsulation = 1; skb_shinfo(skb)->gso_type = SKB_GSO_GRE; type = greh->protocol; if (greh->flags & GRE_KEY) grehlen += GRE_HEADER_SECTION; if (greh->flags & GRE_CSUM) grehlen += GRE_HEADER_SECTION; rcu_read_lock(); ptype = gro_find_complete_by_type(type); if (ptype) err = ptype->callbacks.gro_complete(skb, nhoff + grehlen); rcu_read_unlock(); skb_set_inner_mac_header(skb, nhoff + grehlen); return err; } static const struct net_offload gre_offload = { .callbacks = { .gso_segment = gre_gso_segment, .gro_receive = gre_gro_receive, .gro_complete = gre_gro_complete, }, }; static int __init gre_offload_init(void) { int err; err = inet_add_offload(&gre_offload, IPPROTO_GRE); #if IS_ENABLED(CONFIG_IPV6) if (err) return err; err = inet6_add_offload(&gre_offload, IPPROTO_GRE); if (err) inet_del_offload(&gre_offload, IPPROTO_GRE); #endif return err; } device_initcall(gre_offload_init);
2 2 3 27 26 26 20 105 2 2 2 2 105 2 2 4 4 3 4 4 20 24 77 74 2 26 26 2 1 24 26 6 4 4 3 3 2 2 516 81 79 81 2 514 20 20 84 22 20 19 11 11 11 20 20 20 64 67 1 1 2 2 2 2 528 527 1785 1443 528 527 517 513 514 1772 83 2 83 81 81 77 3 81 81 2 2 81 78 3 3 81 81 3 81 3 81 80 78 77 3 2 2 2 2 2 2 2 3 2 2 2 7 78 3 1 2 3 2 1 1 2 2 1 2 2 3 3 2 1 2 1 2 2 1940 87 6 1782 7 7 3 78 78 2 2 2 3 494 666 1920 2428 2425 2428 385 12 378 56 380 1883 1884 1166 1166 1166 1166 1166 43 43 43 43 43 43 43 43 43 43 31 10 10 10 170 10 10 147 170 174 47 43 146 144 2 143 163 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 /* * linux/drivers/block/loop.c * * Written by Theodore Ts'o, 3/29/93 * * Copyright 1993 by Theodore Ts'o. Redistribution of this file is * permitted under the GNU General Public License. * * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 * * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 * * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 * * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998 * * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 * * Loadable modules and other fixes by AK, 1998 * * Make real block number available to downstream transfer functions, enables * CBC (and relatives) mode encryption requiring unique IVs per data block. * Reed H. Petty, rhp@draper.net * * Maximum number of loop devices now dynamic via max_loop module parameter. * Russell Kroll <rkroll@exploits.org> 19990701 * * Maximum number of loop devices when compiled-in now selectable by passing * max_loop=<1-255> to the kernel on boot. * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999 * * Completely rewrite request handling to be make_request_fn style and * non blocking, pushing work to a helper thread. Lots of fixes from * Al Viro too. * Jens Axboe <axboe@suse.de>, Nov 2000 * * Support up to 256 loop devices * Heinz Mauelshagen <mge@sistina.com>, Feb 2002 * * Support for falling back on the write file operation when the address space * operations write_begin is not available on the backing filesystem. * Anton Altaparmakov, 16 Feb 2005 * * Still To Fix: * - Advisory locking is ignored here. * - Should use an own CAP_* category instead of CAP_SYS_ADMIN * */ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/major.h> #include <linux/wait.h> #include <linux/blkdev.h> #include <linux/blkpg.h> #include <linux/init.h> #include <linux/swap.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/suspend.h> #include <linux/freezer.h> #include <linux/mutex.h> #include <linux/writeback.h> #include <linux/completion.h> #include <linux/highmem.h> #include <linux/kthread.h> #include <linux/splice.h> #include <linux/sysfs.h> #include <linux/miscdevice.h> #include <linux/falloc.h> #include <linux/uio.h> #include "loop.h" #include <linux/uaccess.h> static DEFINE_IDR(loop_index_idr); static DEFINE_MUTEX(loop_index_mutex); static int max_part; static int part_shift; static int transfer_xor(struct loop_device *lo, int cmd, struct page *raw_page, unsigned raw_off, struct page *loop_page, unsigned loop_off, int size, sector_t real_block) { char *raw_buf = kmap_atomic(raw_page) + raw_off; char *loop_buf = kmap_atomic(loop_page) + loop_off; char *in, *out, *key; int i, keysize; if (cmd == READ) { in = raw_buf; out = loop_buf; } else { in = loop_buf; out = raw_buf; } key = lo->lo_encrypt_key; keysize = lo->lo_encrypt_key_size; for (i = 0; i < size; i++) *out++ = *in++ ^ key[(i & 511) % keysize]; kunmap_atomic(loop_buf); kunmap_atomic(raw_buf); cond_resched(); return 0; } static int xor_init(struct loop_device *lo, const struct loop_info64 *info) { if (unlikely(info->lo_encrypt_key_size <= 0)) return -EINVAL; return 0; } static struct loop_func_table none_funcs = { .number = LO_CRYPT_NONE, }; static struct loop_func_table xor_funcs = { .number = LO_CRYPT_XOR, .transfer = transfer_xor, .init = xor_init }; /* xfer_funcs[0] is special - its release function is never called */ static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { &none_funcs, &xor_funcs }; static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file) { loff_t loopsize; /* Compute loopsize in bytes */ loopsize = i_size_read(file->f_mapping->host); if (offset > 0) loopsize -= offset; /* offset is beyond i_size, weird but possible */ if (loopsize < 0) return 0; if (sizelimit > 0 && sizelimit < loopsize) loopsize = sizelimit; /* * Unfortunately, if we want to do I/O on the device, * the number of 512-byte sectors has to fit into a sector_t. */ return loopsize >> 9; } static loff_t get_loop_size(struct loop_device *lo, struct file *file) { return get_size(lo->lo_offset, lo->lo_sizelimit, file); } static void __loop_update_dio(struct loop_device *lo, bool dio) { struct file *file = lo->lo_backing_file; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; unsigned short sb_bsize = 0; unsigned dio_align = 0; bool use_dio; if (inode->i_sb->s_bdev) { sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev); dio_align = sb_bsize - 1; } /* * We support direct I/O only if lo_offset is aligned with the * logical I/O size of backing device, and the logical block * size of loop is bigger than the backing device's and the loop * needn't transform transfer. * * TODO: the above condition may be loosed in the future, and * direct I/O may be switched runtime at that time because most * of requests in sane applications should be PAGE_SIZE aligned */ if (dio) { if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && !(lo->lo_offset & dio_align) && mapping->a_ops->direct_IO && !lo->transfer) use_dio = true; else use_dio = false; } else { use_dio = false; } if (lo->use_dio == use_dio) return; /* flush dirty pages before changing direct IO */ vfs_fsync(file, 0); /* * The flag of LO_FLAGS_DIRECT_IO is handled similarly with * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup * will get updated by ioctl(LOOP_GET_STATUS) */ blk_mq_freeze_queue(lo->lo_queue); lo->use_dio = use_dio; if (use_dio) { queue_flag_clear_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); lo->lo_flags |= LO_FLAGS_DIRECT_IO; } else { queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; } blk_mq_unfreeze_queue(lo->lo_queue); } static int figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) { loff_t size = get_size(offset, sizelimit, lo->lo_backing_file); sector_t x = (sector_t)size; struct block_device *bdev = lo->lo_device; if (unlikely((loff_t)x != size)) return -EFBIG; if (lo->lo_offset != offset) lo->lo_offset = offset; if (lo->lo_sizelimit != sizelimit) lo->lo_sizelimit = sizelimit; set_capacity(lo->lo_disk, x); bd_set_size(bdev, (loff_t)get_capacity(bdev->bd_disk) << 9); /* let user-space know about the new size */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); return 0; } static inline int lo_do_transfer(struct loop_device *lo, int cmd, struct page *rpage, unsigned roffs, struct page *lpage, unsigned loffs, int size, sector_t rblock) { int ret; ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); if (likely(!ret)) return 0; printk_ratelimited(KERN_ERR "loop: Transfer error at byte offset %llu, length %i.\n", (unsigned long long)rblock << 9, size); return ret; } static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) { struct iov_iter i; ssize_t bw; iov_iter_bvec(&i, ITER_BVEC | WRITE, bvec, 1, bvec->bv_len); file_start_write(file); bw = vfs_iter_write(file, &i, ppos, 0); file_end_write(file); if (likely(bw == bvec->bv_len)) return 0; printk_ratelimited(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", (unsigned long long)*ppos, bvec->bv_len); if (bw >= 0) bw = -EIO; return bw; } static int lo_write_simple(struct loop_device *lo, struct request *rq, loff_t pos) { struct bio_vec bvec; struct req_iterator iter; int ret = 0; rq_for_each_segment(bvec, rq, iter) { ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos); if (ret < 0) break; cond_resched(); } return ret; } /* * This is the slow, transforming version that needs to double buffer the * data as it cannot do the transformations in place without having direct * access to the destination pages of the backing file. */ static int lo_write_transfer(struct loop_device *lo, struct request *rq, loff_t pos) { struct bio_vec bvec, b; struct req_iterator iter; struct page *page; int ret = 0; page = alloc_page(GFP_NOIO); if (unlikely(!page)) return -ENOMEM; rq_for_each_segment(bvec, rq, iter) { ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page, bvec.bv_offset, bvec.bv_len, pos >> 9); if (unlikely(ret)) break; b.bv_page = page; b.bv_offset = 0; b.bv_len = bvec.bv_len; ret = lo_write_bvec(lo->lo_backing_file, &b, &pos); if (ret < 0) break; } __free_page(page); return ret; } static int lo_read_simple(struct loop_device *lo, struct request *rq, loff_t pos) { struct bio_vec bvec; struct req_iterator iter; struct iov_iter i; ssize_t len; rq_for_each_segment(bvec, rq, iter) { iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len); len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); if (len < 0) return len; flush_dcache_page(bvec.bv_page); if (len != bvec.bv_len) { struct bio *bio; __rq_for_each_bio(bio, rq) zero_fill_bio(bio); break; } cond_resched(); } return 0; } static int lo_read_transfer(struct loop_device *lo, struct request *rq, loff_t pos) { struct bio_vec bvec, b; struct req_iterator iter; struct iov_iter i; struct page *page; ssize_t len; int ret = 0; page = alloc_page(GFP_NOIO); if (unlikely(!page)) return -ENOMEM; rq_for_each_segment(bvec, rq, iter) { loff_t offset = pos; b.bv_page = page; b.bv_offset = 0; b.bv_len = bvec.bv_len; iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len); len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0); if (len < 0) { ret = len; goto out_free_page; } ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page, bvec.bv_offset, len, offset >> 9); if (ret) goto out_free_page; flush_dcache_page(bvec.bv_page); if (len != bvec.bv_len) { struct bio *bio; __rq_for_each_bio(bio, rq) zero_fill_bio(bio); break; } } ret = 0; out_free_page: __free_page(page); return ret; } static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos, int mode) { /* * We use fallocate to manipulate the space mappings used by the image * a.k.a. discard/zerorange. However we do not support this if * encryption is enabled, because it may give an attacker useful * information. */ struct file *file = lo->lo_backing_file; int ret; mode |= FALLOC_FL_KEEP_SIZE; if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { ret = -EOPNOTSUPP; goto out; } ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) ret = -EIO; out: return ret; } static int lo_req_flush(struct loop_device *lo, struct request *rq) { struct file *file = lo->lo_backing_file; int ret = vfs_fsync(file, 0); if (unlikely(ret && ret != -EINVAL)) ret = -EIO; return ret; } static void lo_complete_rq(struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); if (unlikely(req_op(cmd->rq) == REQ_OP_READ && cmd->use_aio && cmd->ret >= 0 && cmd->ret < blk_rq_bytes(cmd->rq))) { struct bio *bio = cmd->rq->bio; bio_advance(bio, cmd->ret); zero_fill_bio(bio); } blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK); } static void lo_rw_aio_do_completion(struct loop_cmd *cmd) { if (!atomic_dec_and_test(&cmd->ref)) return; kfree(cmd->bvec); cmd->bvec = NULL; blk_mq_complete_request(cmd->rq); } static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); cmd->ret = ret; lo_rw_aio_do_completion(cmd); } static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, loff_t pos, bool rw) { struct iov_iter iter; struct bio_vec *bvec; struct request *rq = cmd->rq; struct bio *bio = rq->bio; struct file *file = lo->lo_backing_file; unsigned int offset; int segments = 0; int ret; if (rq->bio != rq->biotail) { struct req_iterator iter; struct bio_vec tmp; __rq_for_each_bio(bio, rq) segments += bio_segments(bio); bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO); if (!bvec) return -EIO; cmd->bvec = bvec; /* * The bios of the request may be started from the middle of * the 'bvec' because of bio splitting, so we can't directly * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment * API will take care of all details for us. */ rq_for_each_segment(tmp, rq, iter) { *bvec = tmp; bvec++; } bvec = cmd->bvec; offset = 0; } else { /* * Same here, this bio may be started from the middle of the * 'bvec' because of bio splitting, so offset from the bvec * must be passed to iov iterator */ offset = bio->bi_iter.bi_bvec_done; bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); segments = bio_segments(bio); } atomic_set(&cmd->ref, 2); iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, segments, blk_rq_bytes(rq)); iter.iov_offset = offset; cmd->iocb.ki_pos = pos; cmd->iocb.ki_filp = file; cmd->iocb.ki_complete = lo_rw_aio_complete; cmd->iocb.ki_flags = IOCB_DIRECT; if (rw == WRITE) ret = call_write_iter(file, &cmd->iocb, &iter); else ret = call_read_iter(file, &cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); if (ret != -EIOCBQUEUED) cmd->iocb.ki_complete(&cmd->iocb, ret, 0); return 0; } static int do_req_filebacked(struct loop_device *lo, struct request *rq) { struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; /* * lo_write_simple and lo_read_simple should have been covered * by io submit style function like lo_rw_aio(), one blocker * is that lo_read_simple() need to call flush_dcache_page after * the page is written from kernel, and it isn't easy to handle * this in io submit style function which submits all segments * of the req at one time. And direct read IO doesn't need to * run flush_dcache_page(). */ switch (req_op(rq)) { case REQ_OP_FLUSH: return lo_req_flush(lo, rq); case REQ_OP_WRITE_ZEROES: /* * If the caller doesn't want deallocation, call zeroout to * write zeroes the range. Otherwise, punch them out. */ return lo_fallocate(lo, rq, pos, (rq->cmd_flags & REQ_NOUNMAP) ? FALLOC_FL_ZERO_RANGE : FALLOC_FL_PUNCH_HOLE); case REQ_OP_DISCARD: return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE); case REQ_OP_WRITE: if (lo->transfer) return lo_write_transfer(lo, rq, pos); else if (cmd->use_aio) return lo_rw_aio(lo, cmd, pos, WRITE); else return lo_write_simple(lo, rq, pos); case REQ_OP_READ: if (lo->transfer) return lo_read_transfer(lo, rq, pos); else if (cmd->use_aio) return lo_rw_aio(lo, cmd, pos, READ); else return lo_read_simple(lo, rq, pos); default: WARN_ON_ONCE(1); return -EIO; break; } } static inline void loop_update_dio(struct loop_device *lo) { __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) | lo->use_dio); } static void loop_reread_partitions(struct loop_device *lo, struct block_device *bdev) { int rc; /* * bd_mutex has been held already in release path, so don't * acquire it if this function is called in such case. * * If the reread partition isn't from release path, lo_refcnt * must be at least one and it can only become zero when the * current holder is released. */ if (!atomic_read(&lo->lo_refcnt)) rc = __blkdev_reread_part(bdev); else rc = blkdev_reread_part(bdev); if (rc) pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", __func__, lo->lo_number, lo->lo_file_name, rc); } static inline int is_loop_device(struct file *file) { struct inode *i = file->f_mapping->host; return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; } static int loop_validate_file(struct file *file, struct block_device *bdev) { struct inode *inode = file->f_mapping->host; struct file *f = file; /* Avoid recursion */ while (is_loop_device(f)) { struct loop_device *l; if (f->f_mapping->host->i_bdev == bdev) return -EBADF; l = f->f_mapping->host->i_bdev->bd_disk->private_data; if (l->lo_state == Lo_unbound) { return -EINVAL; } f = l->lo_backing_file; } if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) return -EINVAL; return 0; } /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up * the original file and in High Availability environments to switch to * an alternative location for the content in case of server meltdown. * This can only work if the loop device is used read-only, and if the * new backing store is the same size and type as the old backing store. */ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, unsigned int arg) { struct file *file, *old_file; struct inode *inode; int error; error = -ENXIO; if (lo->lo_state != Lo_bound) goto out; /* the loop device has to be read-only */ error = -EINVAL; if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) goto out; error = -EBADF; file = fget(arg); if (!file) goto out; error = loop_validate_file(file, bdev); if (error) goto out_putf; inode = file->f_mapping->host; old_file = lo->lo_backing_file; error = -EINVAL; /* size of the new backing store needs to be the same */ if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) goto out_putf; /* and ... switch */ blk_mq_freeze_queue(lo->lo_queue); mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); lo->lo_backing_file = file; lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping); mapping_set_gfp_mask(file->f_mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); fput(old_file); if (lo->lo_flags & LO_FLAGS_PARTSCAN) loop_reread_partitions(lo, bdev); return 0; out_putf: fput(file); out: return error; } /* loop sysfs attributes */ static ssize_t loop_attr_show(struct device *dev, char *page, ssize_t (*callback)(struct loop_device *, char *)) { struct gendisk *disk = dev_to_disk(dev); struct loop_device *lo = disk->private_data; return callback(lo, page); } #define LOOP_ATTR_RO(_name) \ static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \ static ssize_t loop_attr_do_show_##_name(struct device *d, \ struct device_attribute *attr, char *b) \ { \ return loop_attr_show(d, b, loop_attr_##_name##_show); \ } \ static struct device_attribute loop_attr_##_name = \ __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL); static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf) { ssize_t ret; char *p = NULL; spin_lock_irq(&lo->lo_lock); if (lo->lo_backing_file) p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1); spin_unlock_irq(&lo->lo_lock); if (IS_ERR_OR_NULL(p)) ret = PTR_ERR(p); else { ret = strlen(p); memmove(buf, p, ret); buf[ret++] = '\n'; buf[ret] = 0; } return ret; } static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset); } static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf) { return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit); } static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) { int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR); return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0"); } static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) { int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); return sysfs_emit(buf, "%s\n", partscan ? "1" : "0"); } static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) { int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); return sysfs_emit(buf, "%s\n", dio ? "1" : "0"); } LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); LOOP_ATTR_RO(partscan); LOOP_ATTR_RO(dio); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, &loop_attr_offset.attr, &loop_attr_sizelimit.attr, &loop_attr_autoclear.attr, &loop_attr_partscan.attr, &loop_attr_dio.attr, NULL, }; static struct attribute_group loop_attribute_group = { .name = "loop", .attrs= loop_attrs, }; static void loop_sysfs_init(struct loop_device *lo) { lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj, &loop_attribute_group); } static void loop_sysfs_exit(struct loop_device *lo) { if (lo->sysfs_inited) sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj, &loop_attribute_group); } static void loop_config_discard(struct loop_device *lo) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; struct request_queue *q = lo->lo_queue; /* * We use punch hole to reclaim the free space used by the * image a.k.a. discard. However we do not support discard if * encryption is enabled, because it may give an attacker * useful information. */ if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { q->limits.discard_granularity = 0; q->limits.discard_alignment = 0; blk_queue_max_discard_sectors(q, 0); blk_queue_max_write_zeroes_sectors(q, 0); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); return; } q->limits.discard_granularity = inode->i_sb->s_blocksize; q->limits.discard_alignment = 0; blk_queue_max_discard_sectors(q, UINT_MAX >> 9); blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9); queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } static void loop_unprepare_queue(struct loop_device *lo) { kthread_flush_worker(&lo->worker); kthread_stop(lo->worker_task); } static int loop_kthread_worker_fn(void *worker_ptr) { current->flags |= PF_LESS_THROTTLE | PF_MEMALLOC_NOIO; return kthread_worker_fn(worker_ptr); } static int loop_prepare_queue(struct loop_device *lo) { kthread_init_worker(&lo->worker); lo->worker_task = kthread_run(loop_kthread_worker_fn, &lo->worker, "loop%d", lo->lo_number); if (IS_ERR(lo->worker_task)) return -ENOMEM; set_user_nice(lo->worker_task, MIN_NICE); return 0; } static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct block_device *bdev, unsigned int arg) { struct file *file; struct inode *inode; struct address_space *mapping; int lo_flags = 0; int error; loff_t size; /* This is safe, since we have a reference from open(). */ __module_get(THIS_MODULE); error = -EBADF; file = fget(arg); if (!file) goto out; error = -EBUSY; if (lo->lo_state != Lo_unbound) goto out_putf; error = loop_validate_file(file, bdev); if (error) goto out_putf; mapping = file->f_mapping; inode = mapping->host; if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || !file->f_op->write_iter) lo_flags |= LO_FLAGS_READ_ONLY; error = -EFBIG; size = get_loop_size(lo, file); if ((loff_t)(sector_t)size != size) goto out_putf; error = loop_prepare_queue(lo); if (error) goto out_putf; error = 0; set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); lo->use_dio = false; lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; lo->transfer = NULL; lo->ioctl = NULL; lo->lo_sizelimit = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_write_cache(lo->lo_queue, true, false); loop_update_dio(lo); set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); loop_sysfs_init(lo); /* let user-space know about the new size */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); set_blocksize(bdev, S_ISBLK(inode->i_mode) ? block_size(inode->i_bdev) : PAGE_SIZE); lo->lo_state = Lo_bound; if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) loop_reread_partitions(lo, bdev); /* Grab the block_device to prevent its destruction after we * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). */ bdgrab(bdev); return 0; out_putf: fput(file); out: /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); return error; } static int loop_release_xfer(struct loop_device *lo) { int err = 0; struct loop_func_table *xfer = lo->lo_encryption; if (xfer) { if (xfer->release) err = xfer->release(lo); lo->transfer = NULL; lo->lo_encryption = NULL; module_put(xfer->owner); } return err; } static int loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, const struct loop_info64 *i) { int err = 0; if (xfer) { struct module *owner = xfer->owner; if (!try_module_get(owner)) return -EINVAL; if (xfer->init) err = xfer->init(lo, i); if (err) module_put(owner); else lo->lo_encryption = xfer; } return err; } static int loop_clr_fd(struct loop_device *lo) { struct file *filp = lo->lo_backing_file; gfp_t gfp = lo->old_gfp_mask; struct block_device *bdev = lo->lo_device; if (lo->lo_state != Lo_bound) return -ENXIO; /* * If we've explicitly asked to tear down the loop device, * and it has an elevated reference count, set it for auto-teardown when * the last reference goes away. This stops $!~#$@ udev from * preventing teardown because it decided that it needs to run blkid on * the loopback device whenever they appear. xfstests is notorious for * failing tests because blkid via udev races with a losetup * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d * command to fail with EBUSY. */ if (atomic_read(&lo->lo_refcnt) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; mutex_unlock(&lo->lo_ctl_mutex); return 0; } if (filp == NULL) return -EINVAL; /* freeze request queue during the transition */ blk_mq_freeze_queue(lo->lo_queue); spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_rundown; lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); loop_release_xfer(lo); lo->transfer = NULL; lo->ioctl = NULL; lo->lo_device = NULL; lo->lo_encryption = NULL; lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); blk_queue_logical_block_size(lo->lo_queue, 512); blk_queue_physical_block_size(lo->lo_queue, 512); blk_queue_io_min(lo->lo_queue, 512); if (bdev) { bdput(bdev); invalidate_bdev(bdev); } set_capacity(lo->lo_disk, 0); loop_sysfs_exit(lo); if (bdev) { bd_set_size(bdev, 0); /* let user-space know about this change */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } mapping_set_gfp_mask(filp->f_mapping, gfp); lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); blk_mq_unfreeze_queue(lo->lo_queue); if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) loop_reread_partitions(lo, bdev); lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; loop_unprepare_queue(lo); mutex_unlock(&lo->lo_ctl_mutex); /* * Need not hold lo_ctl_mutex to fput backing file. * Calling fput holding lo_ctl_mutex triggers a circular * lock dependency possibility warning as fput can take * bd_mutex which is usually taken before lo_ctl_mutex. */ fput(filp); return 0; } static int loop_set_status(struct loop_device *lo, const struct loop_info64 *info) { int err; struct loop_func_table *xfer; kuid_t uid = current_uid(); if (lo->lo_encrypt_key_size && !uid_eq(lo->lo_key_owner, uid) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (lo->lo_state != Lo_bound) return -ENXIO; if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) return -EINVAL; if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) { sync_blockdev(lo->lo_device); invalidate_bdev(lo->lo_device); } /* I/O need to be drained during transfer transition */ blk_mq_freeze_queue(lo->lo_queue); err = loop_release_xfer(lo); if (err) goto exit; if (info->lo_encrypt_type) { unsigned int type = info->lo_encrypt_type; if (type >= MAX_LO_CRYPT) { err = -EINVAL; goto exit; } xfer = xfer_funcs[type]; if (xfer == NULL) { err = -EINVAL; goto exit; } } else xfer = NULL; err = loop_init_xfer(lo, xfer, info); if (err) goto exit; if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) { /* kill_bdev should have truncated all the pages */ if (lo->lo_device->bd_inode->i_mapping->nrpages) { err = -EAGAIN; pr_warn("%s: loop%d (%s) has still dirty pages (nrpages=%lu)\n", __func__, lo->lo_number, lo->lo_file_name, lo->lo_device->bd_inode->i_mapping->nrpages); goto exit; } if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { err = -EFBIG; goto exit; } } loop_config_discard(lo); memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; lo->lo_crypt_name[LO_NAME_SIZE-1] = 0; if (!xfer) xfer = &none_funcs; lo->transfer = xfer->transfer; lo->ioctl = xfer->ioctl; if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) != (info->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; lo->lo_encrypt_key_size = info->lo_encrypt_key_size; lo->lo_init[0] = info->lo_init[0]; lo->lo_init[1] = info->lo_init[1]; if (info->lo_encrypt_key_size) { memcpy(lo->lo_encrypt_key, info->lo_encrypt_key, info->lo_encrypt_key_size); lo->lo_key_owner = uid; } /* update dio if lo_offset or transfer is changed */ __loop_update_dio(lo, lo->use_dio); exit: blk_mq_unfreeze_queue(lo->lo_queue); if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { lo->lo_flags |= LO_FLAGS_PARTSCAN; lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; loop_reread_partitions(lo, lo->lo_device); } return err; } static int loop_get_status(struct loop_device *lo, struct loop_info64 *info) { struct path path; struct kstat stat; int ret; if (lo->lo_state != Lo_bound) { mutex_unlock(&lo->lo_ctl_mutex); return -ENXIO; } memset(info, 0, sizeof(*info)); info->lo_number = lo->lo_number; info->lo_offset = lo->lo_offset; info->lo_sizelimit = lo->lo_sizelimit; /* loff_t vars have been assigned __u64 */ if (lo->lo_offset < 0 || lo->lo_sizelimit < 0) return -EOVERFLOW; info->lo_flags = lo->lo_flags; memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE); memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE); info->lo_encrypt_type = lo->lo_encryption ? lo->lo_encryption->number : 0; if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) { info->lo_encrypt_key_size = lo->lo_encrypt_key_size; memcpy(info->lo_encrypt_key, lo->lo_encrypt_key, lo->lo_encrypt_key_size); } /* Drop lo_ctl_mutex while we call into the filesystem. */ path = lo->lo_backing_file->f_path; path_get(&path); mutex_unlock(&lo->lo_ctl_mutex); ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT); if (!ret) { info->lo_device = huge_encode_dev(stat.dev); info->lo_inode = stat.ino; info->lo_rdevice = huge_encode_dev(stat.rdev); } path_put(&path); return ret; } static void loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64) { memset(info64, 0, sizeof(*info64)); info64->lo_number = info->lo_number; info64->lo_device = info->lo_device; info64->lo_inode = info->lo_inode; info64->lo_rdevice = info->lo_rdevice; info64->lo_offset = info->lo_offset; info64->lo_sizelimit = 0; info64->lo_encrypt_type = info->lo_encrypt_type; info64->lo_encrypt_key_size = info->lo_encrypt_key_size; info64->lo_flags = info->lo_flags; info64->lo_init[0] = info->lo_init[0]; info64->lo_init[1] = info->lo_init[1]; if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE); else memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE); } static int loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) { memset(info, 0, sizeof(*info)); info->lo_number = info64->lo_number; info->lo_device = info64->lo_device; info->lo_inode = info64->lo_inode; info->lo_rdevice = info64->lo_rdevice; info->lo_offset = info64->lo_offset; info->lo_encrypt_type = info64->lo_encrypt_type; info->lo_encrypt_key_size = info64->lo_encrypt_key_size; info->lo_flags = info64->lo_flags; inf