Line | Count | Source (jump to first uncovered line) |
1 | | /* Support for specifying IO affinity by various means. |
2 | | Copyright 2010 Intel Corporation |
3 | | Author: Andi Kleen |
4 | | |
5 | | libnuma is free software; you can redistribute it and/or |
6 | | modify it under the terms of the GNU Lesser General Public |
7 | | License as published by the Free Software Foundation; version |
8 | | 2.1. |
9 | | |
10 | | libnuma is distributed in the hope that it will be useful, |
11 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | | Lesser General Public License for more details. |
14 | | |
15 | | You should find a copy of v2.1 of the GNU Lesser General Public License |
16 | | somewhere on your Linux system; if not, write to the Free Software |
17 | | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ |
18 | | |
19 | | /* Notebook: |
20 | | - Separate real errors from no NUMA with fallback |
21 | | - Infiniband |
22 | | - FCoE? |
23 | | - Support for other special IO devices |
24 | | - Specifying cpu subsets inside the IO node? |
25 | | - Handle multiple IO nodes (needs kernel changes) |
26 | | - Better support for multi-path IO? |
27 | | */ |
28 | | #define _GNU_SOURCE 1 |
29 | | #include <string.h> |
30 | | #include <errno.h> |
31 | | #include <sys/stat.h> |
32 | | #include <netdb.h> |
33 | | #include <unistd.h> |
34 | | #include <stdio.h> |
35 | | #include <stdlib.h> |
36 | | #include <sys/socket.h> |
37 | | #include <sys/ioctl.h> |
38 | | #include <net/if.h> |
39 | | #include <dirent.h> |
40 | | #include <linux/rtnetlink.h> |
41 | | #include <linux/netlink.h> |
42 | | #include <sys/types.h> |
43 | | #include <sys/sysmacros.h> |
44 | | #include <ctype.h> |
45 | | #include <assert.h> |
46 | | #include <regex.h> |
47 | | #include <sys/sysmacros.h> |
48 | | #include "numa.h" |
49 | | #include "numaint.h" |
50 | | #include "sysfs.h" |
51 | | #include "affinity.h" |
52 | | #include "rtnetlink.h" |
53 | | |
/* Report whether S holds a character that is not accepted in a device
   name: returns 1 when S contains a '/' or a '.', 0 otherwise. */
static int badchar(const char *s)
{
	return strpbrk(s, "/.") != NULL;
}
60 | | |
61 | | static int node_parse_failure(int ret, char *cls, const char *dev) |
62 | 51 | { |
63 | 51 | if (!cls) |
64 | 8 | cls = ""; |
65 | 51 | if (ret == -2) |
66 | 2 | numa_warn(W_node_parse1, |
67 | 2 | "Kernel does not know node mask for%s%s device `%s'", |
68 | 2 | *cls ? " " : "", cls, dev); |
69 | 49 | else |
70 | 49 | numa_warn(W_node_parse2, |
71 | 49 | "Cannot read node mask for %s device `%s'", |
72 | 49 | cls, dev); |
73 | 51 | return -1; |
74 | 51 | } |
75 | | |
76 | | /* Generic sysfs class lookup */ |
77 | | static int |
78 | | affinity_class(struct bitmask *mask, char *cls, const char *dev) |
79 | 45 | { |
80 | 45 | int ret; |
81 | 45 | while (isspace(*dev)) |
82 | 194 | dev++; |
83 | 45 | if (badchar(dev)) { |
84 | 1 | numa_warn(W_badchar, "Illegal characters in `%s' specification", |
85 | 1 | dev); |
86 | 1 | return -1; |
87 | 1 | } |
88 | | |
89 | | /* Somewhat hackish: extract device from symlink path. |
90 | | Better would be a direct backlink. This knows slightly too |
91 | | much about the actual sysfs layout. */ |
92 | 44 | char path[1024]; |
93 | 44 | char *fn = NULL; |
94 | 44 | if (asprintf(&fn, "/sys/class/%s/%s", cls, dev) > 0 && |
95 | 44 | readlink(fn, path, sizeof path) > 0) { |
96 | 4 | regex_t re; |
97 | 4 | regmatch_t match[2]; |
98 | 4 | char *p; |
99 | | |
100 | 4 | regcomp(&re, "(/devices/pci[0-9a-fA-F:/]+\\.[0-9]+)/", |
101 | 4 | REG_EXTENDED); |
102 | 4 | ret = regexec(&re, path, 2, match, 0); |
103 | 4 | regfree(&re); |
104 | 4 | if (ret == 0) { |
105 | 1 | free(fn); |
106 | 1 | assert(match[0].rm_so > 0); |
107 | 1 | assert(match[0].rm_eo > 0); |
108 | 1 | path[match[1].rm_eo + 1] = 0; |
109 | 1 | p = path + match[0].rm_so; |
110 | 1 | ret = sysfs_node_read(mask, "/sys/%s/numa_node", p); |
111 | 1 | if (ret < 0) |
112 | 1 | return node_parse_failure(ret, NULL, p); |
113 | 0 | return ret; |
114 | 1 | } |
115 | 4 | } |
116 | 43 | free(fn); |
117 | | |
118 | 43 | ret = sysfs_node_read(mask, "/sys/class/%s/%s/device/numa_node", |
119 | 43 | cls, dev); |
120 | 43 | if (ret < 0) |
121 | 43 | return node_parse_failure(ret, cls, dev); |
122 | 0 | return 0; |
123 | 43 | } |
124 | | |
125 | | /* Turn file (or device node) into class name */ |
126 | | static int affinity_file(struct bitmask *mask, char *cls, const char *file) |
127 | 3 | { |
128 | 3 | struct stat st; |
129 | 3 | DIR *dir; |
130 | 3 | int n; |
131 | 3 | unsigned maj = 0, min = 0; |
132 | 3 | dev_t d; |
133 | 3 | struct dirent *dep; |
134 | | |
135 | 3 | cls = "block"; |
136 | 3 | char fn[sizeof("/sys/class/") + strlen(cls)]; |
137 | 3 | if (stat(file, &st) < 0) { |
138 | 1 | numa_warn(W_blockdev1, "Cannot stat file %s", file); |
139 | 1 | return -1; |
140 | 1 | } |
141 | 2 | d = st.st_dev; |
142 | 2 | if (S_ISCHR(st.st_mode)) { |
143 | | /* Better choice than misc? Most likely misc will not work |
144 | | anyways unless the kernel is fixed. */ |
145 | 0 | cls = "misc"; |
146 | 0 | d = st.st_rdev; |
147 | 2 | } else if (S_ISBLK(st.st_mode)) |
148 | 0 | d = st.st_rdev; |
149 | | |
150 | 2 | sprintf(fn, "/sys/class/%s", cls); |
151 | 2 | dir = opendir(fn); |
152 | 2 | if (!dir) { |
153 | 0 | numa_warn(W_blockdev2, "Cannot enumerate %s devices in sysfs", |
154 | 0 | cls); |
155 | 0 | return -1; |
156 | 0 | } |
157 | 25 | while ((dep = readdir(dir)) != NULL) { |
158 | 24 | char *name = dep->d_name; |
159 | 24 | int ret; |
160 | | |
161 | 24 | if (*name == '.') |
162 | 4 | continue; |
163 | 20 | char *dev; |
164 | 20 | char fn2[sizeof("/sys/class/block//dev") + strlen(name)]; |
165 | | |
166 | 20 | n = -1; |
167 | 20 | if (sprintf(fn2, "/sys/class/block/%s/dev", name) < 0) |
168 | 0 | break; |
169 | 20 | dev = sysfs_read(fn2); |
170 | 20 | if (dev) { |
171 | 20 | n = sscanf(dev, "%u:%u", &maj, &min); |
172 | 20 | free(dev); |
173 | 20 | } |
174 | 20 | if (n != 2) { |
175 | 0 | numa_warn(W_blockdev3, "Cannot parse sysfs device %s", |
176 | 0 | name); |
177 | 0 | continue; |
178 | 0 | } |
179 | | |
180 | 20 | if (major(d) != maj || minor(d) != min) |
181 | 19 | continue; |
182 | | |
183 | 1 | ret = affinity_class(mask, "block", name); |
184 | 1 | closedir(dir); |
185 | 1 | return ret; |
186 | 20 | } |
187 | 1 | closedir(dir); |
188 | 1 | numa_warn(W_blockdev5, "Cannot find block device %x:%x in sysfs for `%s'", |
189 | 1 | maj, min, file); |
190 | 1 | return -1; |
191 | 2 | } |
192 | | |
193 | | /* Look up interface of route using rtnetlink. */ |
194 | | static int find_route(struct sockaddr *dst, int *iifp) |
195 | 2 | { |
196 | 2 | struct rtattr *rta; |
197 | 2 | const int hdrlen = NLMSG_LENGTH(sizeof(struct rtmsg)); |
198 | 2 | struct { |
199 | 2 | struct nlmsghdr msg; |
200 | 2 | struct rtmsg rt; |
201 | 2 | char buf[256]; |
202 | 2 | } req = { |
203 | 2 | .msg = { |
204 | 2 | .nlmsg_len = hdrlen, |
205 | 2 | .nlmsg_type = RTM_GETROUTE, |
206 | 2 | .nlmsg_flags = NLM_F_REQUEST, |
207 | 2 | }, |
208 | 2 | .rt = { |
209 | 2 | .rtm_family = dst->sa_family, |
210 | 2 | }, |
211 | 2 | }; |
212 | 2 | struct sockaddr_nl adr = { |
213 | 2 | .nl_family = AF_NETLINK, |
214 | 2 | }; |
215 | | |
216 | 2 | if (rta_put_address(&req.msg, RTA_DST, dst) < 0) { |
217 | 0 | numa_warn(W_netlink1, "Cannot handle network family %x", |
218 | 0 | dst->sa_family); |
219 | 0 | return -1; |
220 | 0 | } |
221 | | |
222 | 2 | if (rtnetlink_request(&req.msg, sizeof req, &adr) < 0) { |
223 | 0 | numa_warn(W_netlink2, "Cannot request rtnetlink route: %s", |
224 | 0 | strerror(errno)); |
225 | 0 | return -1; |
226 | 0 | } |
227 | | |
228 | | /* Fish the interface out of the netlink soup. */ |
229 | 2 | rta = NULL; |
230 | 6 | while ((rta = rta_get(&req.msg, rta, hdrlen)) != NULL) { |
231 | 6 | if (rta->rta_type == RTA_OIF) { |
232 | 2 | memcpy(iifp, RTA_DATA(rta), sizeof(int)); |
233 | 2 | return 0; |
234 | 2 | } |
235 | 6 | } |
236 | | |
237 | 0 | numa_warn(W_netlink3, "rtnetlink query did not return interface"); |
238 | 0 | return -1; |
239 | 2 | } |
240 | | |
/* Ask the kernel for the name of the interface with index IIF.
   Stores IIF in IFR and issues SIOCGIFNAME on a temporary datagram
   socket, which is closed again before returning.
   Returns the ioctl result, or -1 if the socket cannot be created. */
static int iif_to_name(int iif, struct ifreq *ifr)
{
	int fd, rc;

	fd = socket(PF_INET, SOCK_DGRAM, 0);
	if (fd >= 0) {
		ifr->ifr_ifindex = iif;
		rc = ioctl(fd, SIOCGIFNAME, ifr);
		close(fd);
		return rc;
	}
	return -1;
}
252 | | |
/* Resolve an IP address to the nodes of a network device.
   ID is resolved with getaddrinfo(), the outgoing interface for the
   first returned address is looked up, and that interface is passed on
   to the "net" class lookup to fill MASK.  The CLS argument is unused.
   This generally only attempts to handle simple cases:
   no multi-path, no bonding etc. In these cases only
   the first interface or none is chosen.
   Returns the result of the class lookup, or -1 on failure. */
static int affinity_ip(struct bitmask *mask, char *cls, const char *id)
{
	struct addrinfo *res;
	struct ifreq req;
	int ifindex;
	int have_name = 0;
	int rc;

	rc = getaddrinfo(id, NULL, NULL, &res);
	if (rc != 0) {
		numa_warn(W_net1, "Cannot resolve %s: %s",
			  id, gai_strerror(rc));
		return -1;
	}

	/* find_route() issues its own warnings; only the name lookup needs
	   one here. */
	if (find_route(res->ai_addr, &ifindex) >= 0) {
		if (iif_to_name(ifindex, &req) >= 0)
			have_name = 1;
		else
			numa_warn(W_net2, "Cannot resolve network interface %d",
				  ifindex);
	}

	/* The address list is released on every path from here on. */
	freeaddrinfo(res);
	if (!have_name)
		return -1;
	return affinity_class(mask, "net", req.ifr_name);
}
285 | | |
286 | | /* Look up affinity for a PCI device */ |
287 | | static int affinity_pci(struct bitmask *mask, char *cls, const char *id) |
288 | 10 | { |
289 | 10 | unsigned seg, bus, dev, func; |
290 | 10 | int n, ret; |
291 | | |
292 | | /* Func is optional. */ |
293 | 10 | if ((n = sscanf(id, "%x:%x:%x.%x",&seg,&bus,&dev,&func)) == 4 || n == 3) { |
294 | 3 | if (n == 3) |
295 | 2 | func = 0; |
296 | 3 | } |
297 | | /* Segment is optional too */ |
298 | 7 | else if ((n = sscanf(id, "%x:%x.%x",&bus,&dev,&func)) == 3 || n == 2) { |
299 | 4 | seg = 0; |
300 | 4 | if (n == 2) |
301 | 3 | func = 0; |
302 | 4 | } else { |
303 | 3 | numa_warn(W_pci1, "Cannot parse PCI device `%s'", id); |
304 | 3 | return -1; |
305 | 3 | } |
306 | 7 | ret = sysfs_node_read(mask, |
307 | 7 | "/sys/devices/pci%04x:%02x/%04x:%02x:%02x.%x/numa_node", |
308 | 7 | seg, bus, seg, bus, dev, func); |
309 | 7 | if (ret < 0) |
310 | 7 | return node_parse_failure(ret, cls, id); |
311 | 0 | return 0; |
312 | 7 | } |
313 | | |
314 | | static struct handler { |
315 | | char first; |
316 | | char *name; |
317 | | char *cls; |
318 | | int (*handler)(struct bitmask *mask, char *cls, const char *desc); |
319 | | } handlers[] = { |
320 | | { 'n', "netdev:", "net", affinity_class }, |
321 | | { 'i', "ip:", NULL, affinity_ip }, |
322 | | { 'f', "file:", NULL, affinity_file }, |
323 | | { 'b', "block:", "block", affinity_class }, |
324 | | { 'p', "pci:", NULL, affinity_pci }, |
325 | | {} |
326 | | }; |
327 | | |
328 | | hidden int resolve_affinity(const char *id, struct bitmask *mask) |
329 | 141 | { |
330 | 141 | struct handler *h; |
331 | | |
332 | 721 | for (h = &handlers[0]; h->first; h++) { |
333 | 639 | int len; |
334 | 639 | if (id[0] != h->first) |
335 | 525 | continue; |
336 | 114 | len = strlen(h->name); |
337 | 114 | if (!strncmp(id, h->name, len)) { |
338 | 59 | int ret = h->handler(mask, h->cls, id + len); |
339 | 59 | if (ret == -2) { |
340 | 0 | numa_warn(W_nonode, "Kernel does not know node for %s\n", |
341 | 0 | id + len); |
342 | 0 | } |
343 | 59 | return ret; |
344 | 59 | } |
345 | 114 | } |
346 | 82 | return NO_IO_AFFINITY; |
347 | 141 | } |