/* Support for specifying IO affinity by various means. Copyright 2010 Intel Corporation Author: Andi Kleen libnuma is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; version 2.1. libnuma is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should find a copy of v2.1 of the GNU Lesser General Public License somewhere on your Linux system; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* Notebook: - Separate real errors from no NUMA with fallback - Infiniband - FCoE? - Support for other special IO devices - Specifying cpu subsets inside the IO node? - Handle multiple IO nodes (needs kernel changes) - Better support for multi-path IO? */ #define _GNU_SOURCE 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "numa.h" #include "numaint.h" #include "sysfs.h" #include "affinity.h" #include "rtnetlink.h" static int badchar(const char *s) { if (strpbrk(s, "/.")) return 1; return 0; } static int node_parse_failure(int ret, char *cls, const char *dev) { if (!cls) cls = ""; if (ret == -2) numa_warn(W_node_parse1, "Kernel does not know node mask for%s%s device `%s'", *cls ? " " : "", cls, dev); else numa_warn(W_node_parse2, "Cannot read node mask for %s device `%s'", cls, dev); return -1; } /* Generic sysfs class lookup */ static int affinity_class(struct bitmask *mask, char *cls, const char *dev) { int ret; while (isspace(*dev)) dev++; if (badchar(dev)) { numa_warn(W_badchar, "Illegal characters in `%s' specification", dev); return -1; } /* Somewhat hackish: extract device from symlink path. Better would be a direct backlink. This knows slightly too much about the actual sysfs layout. */ char path[1024]; char *fn = NULL; if (asprintf(&fn, "/sys/class/%s/%s", cls, dev) > 0 && readlink(fn, path, sizeof path) > 0) { regex_t re; regmatch_t match[2]; char *p; regcomp(&re, "(/devices/pci[0-9a-fA-F:/]+\\.[0-9]+)/", REG_EXTENDED); ret = regexec(&re, path, 2, match, 0); regfree(&re); if (ret == 0) { free(fn); assert(match[0].rm_so > 0); assert(match[0].rm_eo > 0); path[match[1].rm_eo + 1] = 0; p = path + match[0].rm_so; ret = sysfs_node_read(mask, "/sys/%s/numa_node", p); if (ret < 0) return node_parse_failure(ret, NULL, p); return ret; } } free(fn); ret = sysfs_node_read(mask, "/sys/class/%s/%s/device/numa_node", cls, dev); if (ret < 0) return node_parse_failure(ret, cls, dev); return 0; } /* Turn file (or device node) into class name */ static int affinity_file(struct bitmask *mask, char *cls, const char *file) { struct stat st; DIR *dir; int n; unsigned maj = 0, min = 0; dev_t d; struct dirent *dep; cls = "block"; char fn[sizeof("/sys/class/") + strlen(cls)]; if (stat(file, &st) < 0) { numa_warn(W_blockdev1, "Cannot stat file %s", file); return -1; } d = st.st_dev; if (S_ISCHR(st.st_mode)) { /* Better choice than misc? Most likely misc will not work anyways unless the kernel is fixed. */ cls = "misc"; d = st.st_rdev; } else if (S_ISBLK(st.st_mode)) d = st.st_rdev; sprintf(fn, "/sys/class/%s", cls); dir = opendir(fn); if (!dir) { numa_warn(W_blockdev2, "Cannot enumerate %s devices in sysfs", cls); return -1; } while ((dep = readdir(dir)) != NULL) { char *name = dep->d_name; int ret; if (*name == '.') continue; char *dev; char fn2[sizeof("/sys/class/block//dev") + strlen(name)]; n = -1; if (sprintf(fn2, "/sys/class/block/%s/dev", name) < 0) break; dev = sysfs_read(fn2); if (dev) { n = sscanf(dev, "%u:%u", &maj, &min); free(dev); } if (n != 2) { numa_warn(W_blockdev3, "Cannot parse sysfs device %s", name); continue; } if (major(d) != maj || minor(d) != min) continue; ret = affinity_class(mask, "block", name); closedir(dir); return ret; } closedir(dir); numa_warn(W_blockdev5, "Cannot find block device %x:%x in sysfs for `%s'", maj, min, file); return -1; } /* Look up interface of route using rtnetlink. */ static int find_route(struct sockaddr *dst, int *iifp) { struct rtattr *rta; const int hdrlen = NLMSG_LENGTH(sizeof(struct rtmsg)); struct { struct nlmsghdr msg; struct rtmsg rt; char buf[256]; } req = { .msg = { .nlmsg_len = hdrlen, .nlmsg_type = RTM_GETROUTE, .nlmsg_flags = NLM_F_REQUEST, }, .rt = { .rtm_family = dst->sa_family, }, }; struct sockaddr_nl adr = { .nl_family = AF_NETLINK, }; if (rta_put_address(&req.msg, RTA_DST, dst) < 0) { numa_warn(W_netlink1, "Cannot handle network family %x", dst->sa_family); return -1; } if (rtnetlink_request(&req.msg, sizeof req, &adr) < 0) { numa_warn(W_netlink2, "Cannot request rtnetlink route: %s", strerror(errno)); return -1; } /* Fish the interface out of the netlink soup. */ rta = NULL; while ((rta = rta_get(&req.msg, rta, hdrlen)) != NULL) { if (rta->rta_type == RTA_OIF) { memcpy(iifp, RTA_DATA(rta), sizeof(int)); return 0; } } numa_warn(W_netlink3, "rtnetlink query did not return interface"); return -1; } static int iif_to_name(int iif, struct ifreq *ifr) { int n; int sk = socket(PF_INET, SOCK_DGRAM, 0); if (sk < 0) return -1; ifr->ifr_ifindex = iif; n = ioctl(sk, SIOCGIFNAME, ifr); close(sk); return n; } /* Resolve an IP address to the nodes of a network device. This generally only attempts to handle simple cases: no multi-path, no bounding etc. In these cases only the first interface or none is chosen. */ static int affinity_ip(struct bitmask *mask, char *cls, const char *id) { struct addrinfo *ai; int n; int iif; struct ifreq ifr; if ((n = getaddrinfo(id, NULL, NULL, &ai)) != 0) { numa_warn(W_net1, "Cannot resolve %s: %s", id, gai_strerror(n)); return -1; } if (find_route(&ai->ai_addr[0], &iif) < 0) goto out_ai; if (iif_to_name(iif, &ifr) < 0) { numa_warn(W_net2, "Cannot resolve network interface %d", iif); goto out_ai; } freeaddrinfo(ai); return affinity_class(mask, "net", ifr.ifr_name); out_ai: freeaddrinfo(ai); return -1; } /* Look up affinity for a PCI device */ static int affinity_pci(struct bitmask *mask, char *cls, const char *id) { unsigned seg, bus, dev, func; int n, ret; /* Func is optional. */ if ((n = sscanf(id, "%x:%x:%x.%x",&seg,&bus,&dev,&func)) == 4 || n == 3) { if (n == 3) func = 0; } /* Segment is optional too */ else if ((n = sscanf(id, "%x:%x.%x",&bus,&dev,&func)) == 3 || n == 2) { seg = 0; if (n == 2) func = 0; } else { numa_warn(W_pci1, "Cannot parse PCI device `%s'", id); return -1; } ret = sysfs_node_read(mask, "/sys/devices/pci%04x:%02x/%04x:%02x:%02x.%x/numa_node", seg, bus, seg, bus, dev, func); if (ret < 0) return node_parse_failure(ret, cls, id); return 0; } static struct handler { char first; char *name; char *cls; int (*handler)(struct bitmask *mask, char *cls, const char *desc); } handlers[] = { { 'n', "netdev:", "net", affinity_class }, { 'i', "ip:", NULL, affinity_ip }, { 'f', "file:", NULL, affinity_file }, { 'b', "block:", "block", affinity_class }, { 'p', "pci:", NULL, affinity_pci }, {} }; hidden int resolve_affinity(const char *id, struct bitmask *mask) { struct handler *h; for (h = &handlers[0]; h->first; h++) { int len; if (id[0] != h->first) continue; len = strlen(h->name); if (!strncmp(id, h->name, len)) { int ret = h->handler(mask, h->cls, id + len); if (ret == -2) { numa_warn(W_nonode, "Kernel does not know node for %s\n", id + len); } return ret; } } return NO_IO_AFFINITY; }