/* * lxc: linux Container library * * (C) Copyright IBM Corp. 2007, 2008 * * Authors: * Daniel Lezcano * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ #include "config.h" #include #include #include #include #include #include #include #include #include #include #include #if HAVE_PTY_H #include #else #include <../include/openpty.h> #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "network.h" #include "error.h" #include "parse.h" #include "utils.h" #include "conf.h" #include "log.h" #include "caps.h" /* for lxc_caps_last_cap() */ #include "bdev.h" #include "cgroup.h" #include "lxclock.h" #include "namespace.h" #include "lsm/lsm.h" #if HAVE_SYS_CAPABILITY_H #include #endif #if HAVE_SYS_PERSONALITY_H #include #endif #if IS_BIONIC #include <../include/lxcmntent.h> #else #include #endif #include "lxcseccomp.h" lxc_log_define(lxc_conf, lxc); #define MAXHWLEN 18 #define MAXINDEXLEN 20 #define MAXMTULEN 16 #define MAXLINELEN 128 #if HAVE_SYS_CAPABILITY_H #ifndef CAP_SETFCAP #define CAP_SETFCAP 31 #endif #ifndef CAP_MAC_OVERRIDE #define CAP_MAC_OVERRIDE 32 #endif #ifndef CAP_MAC_ADMIN #define CAP_MAC_ADMIN 33 #endif #endif #ifndef PR_CAPBSET_DROP #define PR_CAPBSET_DROP 24 #endif #ifndef LO_FLAGS_AUTOCLEAR #define LO_FLAGS_AUTOCLEAR 4 #endif /* Define pivot_root() if missing from the C library */ #ifndef HAVE_PIVOT_ROOT static int pivot_root(const char * new_root, const char * put_old) { #ifdef __NR_pivot_root return syscall(__NR_pivot_root, new_root, put_old); #else errno = ENOSYS; return -1; #endif } #else extern int pivot_root(const char * new_root, const char * put_old); #endif /* Define sethostname() if missing from the C library */ #ifndef HAVE_SETHOSTNAME static int sethostname(const char * name, size_t len) { #ifdef __NR_sethostname return syscall(__NR_sethostname, name, len); #else errno = ENOSYS; return -1; #endif } #endif /* Define __S_ISTYPE if missing from the C library */ #ifndef __S_ISTYPE #define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask)) #endif char *lxchook_names[NUM_LXC_HOOKS] = { "pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" }; typedef int (*instanciate_cb)(struct lxc_handler *, struct lxc_netdev *); struct mount_opt { char *name; int clear; int flag; }; struct caps_opt { char *name; int value; }; static int instanciate_veth(struct lxc_handler *, struct lxc_netdev *); static int instanciate_macvlan(struct lxc_handler *, struct lxc_netdev *); static int instanciate_vlan(struct lxc_handler *, struct lxc_netdev *); static int instanciate_phys(struct lxc_handler *, struct lxc_netdev *); static int instanciate_empty(struct lxc_handler *, struct lxc_netdev *); static int instanciate_none(struct lxc_handler *, struct lxc_netdev *); static instanciate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = { [LXC_NET_VETH] = instanciate_veth, [LXC_NET_MACVLAN] = instanciate_macvlan, [LXC_NET_VLAN] = instanciate_vlan, [LXC_NET_PHYS] = instanciate_phys, [LXC_NET_EMPTY] = instanciate_empty, [LXC_NET_NONE] = instanciate_none, }; static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *); static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *); static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *); static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *); static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *); static int shutdown_none(struct lxc_handler *, struct lxc_netdev *); static instanciate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = { [LXC_NET_VETH] = shutdown_veth, [LXC_NET_MACVLAN] = shutdown_macvlan, [LXC_NET_VLAN] = shutdown_vlan, [LXC_NET_PHYS] = shutdown_phys, [LXC_NET_EMPTY] = shutdown_empty, [LXC_NET_NONE] = shutdown_none, }; static struct mount_opt mount_opt[] = { { "defaults", 0, 0 }, { "ro", 0, MS_RDONLY }, { "rw", 1, MS_RDONLY }, { "suid", 1, MS_NOSUID }, { "nosuid", 0, MS_NOSUID }, { "dev", 1, MS_NODEV }, { "nodev", 0, MS_NODEV }, { "exec", 1, MS_NOEXEC }, { "noexec", 0, MS_NOEXEC }, { "sync", 0, MS_SYNCHRONOUS }, { "async", 1, MS_SYNCHRONOUS }, { "dirsync", 0, MS_DIRSYNC }, { "remount", 0, MS_REMOUNT }, { "mand", 0, MS_MANDLOCK }, { "nomand", 1, MS_MANDLOCK }, { "atime", 1, MS_NOATIME }, { "noatime", 0, MS_NOATIME }, { "diratime", 1, MS_NODIRATIME }, { "nodiratime", 0, MS_NODIRATIME }, { "bind", 0, MS_BIND }, { "rbind", 0, MS_BIND|MS_REC }, { "relatime", 0, MS_RELATIME }, { "norelatime", 1, MS_RELATIME }, { "strictatime", 0, MS_STRICTATIME }, { "nostrictatime", 1, MS_STRICTATIME }, { NULL, 0, 0 }, }; #if HAVE_SYS_CAPABILITY_H static struct caps_opt caps_opt[] = { { "chown", CAP_CHOWN }, { "dac_override", CAP_DAC_OVERRIDE }, { "dac_read_search", CAP_DAC_READ_SEARCH }, { "fowner", CAP_FOWNER }, { "fsetid", CAP_FSETID }, { "kill", CAP_KILL }, { "setgid", CAP_SETGID }, { "setuid", CAP_SETUID }, { "setpcap", CAP_SETPCAP }, { "linux_immutable", CAP_LINUX_IMMUTABLE }, { "net_bind_service", CAP_NET_BIND_SERVICE }, { "net_broadcast", CAP_NET_BROADCAST }, { "net_admin", CAP_NET_ADMIN }, { "net_raw", CAP_NET_RAW }, { "ipc_lock", CAP_IPC_LOCK }, { "ipc_owner", CAP_IPC_OWNER }, { "sys_module", CAP_SYS_MODULE }, { "sys_rawio", CAP_SYS_RAWIO }, { "sys_chroot", CAP_SYS_CHROOT }, { "sys_ptrace", CAP_SYS_PTRACE }, { "sys_pacct", CAP_SYS_PACCT }, { "sys_admin", CAP_SYS_ADMIN }, { "sys_boot", CAP_SYS_BOOT }, { "sys_nice", CAP_SYS_NICE }, { "sys_resource", CAP_SYS_RESOURCE }, { "sys_time", CAP_SYS_TIME }, { "sys_tty_config", CAP_SYS_TTY_CONFIG }, { "mknod", CAP_MKNOD }, { "lease", CAP_LEASE }, #ifdef CAP_AUDIT_WRITE { "audit_write", CAP_AUDIT_WRITE }, #endif #ifdef CAP_AUDIT_CONTROL { "audit_control", CAP_AUDIT_CONTROL }, #endif { "setfcap", CAP_SETFCAP }, { "mac_override", CAP_MAC_OVERRIDE }, { "mac_admin", CAP_MAC_ADMIN }, #ifdef CAP_SYSLOG { "syslog", CAP_SYSLOG }, #endif #ifdef CAP_WAKE_ALARM { "wake_alarm", CAP_WAKE_ALARM }, #endif }; #else static struct caps_opt caps_opt[] = {}; #endif static int run_buffer(char *buffer) { struct lxc_popen_FILE *f; char *output; int ret; f = lxc_popen(buffer); if (!f) { SYSERROR("popen failed"); return -1; } output = malloc(LXC_LOG_BUFFER_SIZE); if (!output) { ERROR("failed to allocate memory for script output"); lxc_pclose(f); return -1; } while(fgets(output, LXC_LOG_BUFFER_SIZE, f->f)) DEBUG("script output: %s", output); free(output); ret = lxc_pclose(f); if (ret == -1) { SYSERROR("Script exited on error"); return -1; } else if (WIFEXITED(ret) && WEXITSTATUS(ret) != 0) { ERROR("Script exited with status %d", WEXITSTATUS(ret)); return -1; } else if (WIFSIGNALED(ret)) { ERROR("Script terminated by signal %d (%s)", WTERMSIG(ret), strsignal(WTERMSIG(ret))); return -1; } return 0; } static int run_script_argv(const char *name, const char *section, const char *script, const char *hook, const char *lxcpath, char **argsin) { int ret, i; char *buffer; size_t size = 0; INFO("Executing script '%s' for container '%s', config section '%s'", script, name, section); for (i=0; argsin && argsin[i]; i++) size += strlen(argsin[i]) + 1; size += strlen(hook) + 1; size += strlen(script); size += strlen(name); size += strlen(section); size += 3; if (size > INT_MAX) return -1; buffer = alloca(size); if (!buffer) { ERROR("failed to allocate memory"); return -1; } ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook); if (ret < 0 || ret >= size) { ERROR("Script name too long"); return -1; } for (i=0; argsin && argsin[i]; i++) { int len = size-ret; int rc; rc = snprintf(buffer + ret, len, " %s", argsin[i]); if (rc < 0 || rc >= len) { ERROR("Script args too long"); return -1; } ret += rc; } return run_buffer(buffer); } static int run_script(const char *name, const char *section, const char *script, ...) { int ret; char *buffer, *p; size_t size = 0; va_list ap; INFO("Executing script '%s' for container '%s', config section '%s'", script, name, section); va_start(ap, script); while ((p = va_arg(ap, char *))) size += strlen(p) + 1; va_end(ap); size += strlen(script); size += strlen(name); size += strlen(section); size += 3; if (size > INT_MAX) return -1; buffer = alloca(size); if (!buffer) { ERROR("failed to allocate memory"); return -1; } ret = snprintf(buffer, size, "%s %s %s", script, name, section); if (ret < 0 || ret >= size) { ERROR("Script name too long"); return -1; } va_start(ap, script); while ((p = va_arg(ap, char *))) { int len = size-ret; int rc; rc = snprintf(buffer + ret, len, " %s", p); if (rc < 0 || rc >= len) { ERROR("Script args too long"); return -1; } ret += rc; } va_end(ap); return run_buffer(buffer); } static int find_fstype_cb(char* buffer, void *data) { struct cbarg { const char *rootfs; const char *target; const char *options; } *cbarg = data; unsigned long mntflags; char *mntdata; char *fstype; /* we don't try 'nodev' entries */ if (strstr(buffer, "nodev")) return 0; fstype = buffer; fstype += lxc_char_left_gc(fstype, strlen(fstype)); fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0'; DEBUG("trying to mount '%s'->'%s' with fstype '%s'", cbarg->rootfs, cbarg->target, fstype); if (parse_mntopts(cbarg->options, &mntflags, &mntdata) < 0) { free(mntdata); return -1; } if (mount(cbarg->rootfs, cbarg->target, fstype, mntflags, mntdata)) { DEBUG("mount failed with error: %s", strerror(errno)); free(mntdata); return 0; } free(mntdata); INFO("mounted '%s' on '%s', with fstype '%s'", cbarg->rootfs, cbarg->target, fstype); return 1; } static int mount_unknown_fs(const char *rootfs, const char *target, const char *options) { int i; struct cbarg { const char *rootfs; const char *target; const char *options; } cbarg = { .rootfs = rootfs, .target = target, .options = options, }; /* * find the filesystem type with brute force: * first we check with /etc/filesystems, in case the modules * are auto-loaded and fall back to the supported kernel fs */ char *fsfile[] = { "/etc/filesystems", "/proc/filesystems", }; for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) { int ret; if (access(fsfile[i], F_OK)) continue; ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg); if (ret < 0) { ERROR("failed to parse '%s'", fsfile[i]); return -1; } if (ret) return 0; } ERROR("failed to determine fs type for '%s'", rootfs); return -1; } static int mount_rootfs_dir(const char *rootfs, const char *target, const char *options) { unsigned long mntflags; char *mntdata; int ret; if (parse_mntopts(options, &mntflags, &mntdata) < 0) { free(mntdata); return -1; } ret = mount(rootfs, target, "none", MS_BIND | MS_REC | mntflags, mntdata); free(mntdata); return ret; } static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo) { int rfd; int ret = -1; rfd = open(rootfs, O_RDWR); if (rfd < 0) { SYSERROR("failed to open '%s'", rootfs); return -1; } memset(loinfo, 0, sizeof(*loinfo)); loinfo->lo_flags = LO_FLAGS_AUTOCLEAR; if (ioctl(fd, LOOP_SET_FD, rfd)) { SYSERROR("failed to LOOP_SET_FD"); goto out; } if (ioctl(fd, LOOP_SET_STATUS64, loinfo)) { SYSERROR("failed to LOOP_SET_STATUS64"); goto out; } ret = 0; out: close(rfd); return ret; } static int mount_rootfs_file(const char *rootfs, const char *target, const char *options) { struct dirent dirent, *direntp; struct loop_info64 loinfo; int ret = -1, fd = -1, rc; DIR *dir; char path[MAXPATHLEN]; dir = opendir("/dev"); if (!dir) { SYSERROR("failed to open '/dev'"); return -1; } while (!readdir_r(dir, &dirent, &direntp)) { if (!direntp) break; if (!strcmp(direntp->d_name, ".")) continue; if (!strcmp(direntp->d_name, "..")) continue; if (strncmp(direntp->d_name, "loop", 4)) continue; rc = snprintf(path, MAXPATHLEN, "/dev/%s", direntp->d_name); if (rc < 0 || rc >= MAXPATHLEN) continue; fd = open(path, O_RDWR); if (fd < 0) continue; if (ioctl(fd, LOOP_GET_STATUS64, &loinfo) == 0) { close(fd); continue; } if (errno != ENXIO) { WARN("unexpected error for ioctl on '%s': %m", direntp->d_name); close(fd); continue; } DEBUG("found '%s' free lodev", path); ret = setup_lodev(rootfs, fd, &loinfo); if (!ret) ret = mount_unknown_fs(path, target, options); close(fd); break; } if (closedir(dir)) WARN("failed to close directory"); return ret; } static int mount_rootfs_block(const char *rootfs, const char *target, const char *options) { return mount_unknown_fs(rootfs, target, options); } /* * pin_rootfs * if rootfs is a directory, then open ${rootfs}/lxc.hold for writing for * the duration of the container run, to prevent the container from marking * the underlying fs readonly on shutdown. unlink the file immediately so * no name pollution is happens * return -1 on error. * return -2 if nothing needed to be pinned. * return an open fd (>=0) if we pinned it. */ int pin_rootfs(const char *rootfs) { char absrootfs[MAXPATHLEN]; char absrootfspin[MAXPATHLEN]; struct stat s; int ret, fd; if (rootfs == NULL || strlen(rootfs) == 0) return -2; if (!realpath(rootfs, absrootfs)) return -2; if (access(absrootfs, F_OK)) return -1; if (stat(absrootfs, &s)) return -1; if (!S_ISDIR(s.st_mode)) return -2; ret = snprintf(absrootfspin, MAXPATHLEN, "%s/lxc.hold", absrootfs); if (ret >= MAXPATHLEN) return -1; fd = open(absrootfspin, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR); if (fd < 0) return fd; (void)unlink(absrootfspin); return fd; } static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler) { int r; size_t i; static struct { int match_mask; int match_flag; const char *source; const char *destination; const char *fstype; unsigned long flags; const char *options; } default_mounts[] = { /* Read-only bind-mounting... In older kernels, doing that required * to do one MS_BIND mount and then MS_REMOUNT|MS_RDONLY the same * one. According to mount(2) manpage, MS_BIND honors MS_RDONLY from * kernel 2.6.26 onwards. However, this apparently does not work on * kernel 3.8. Unfortunately, on that very same kernel, doing the * same trick as above doesn't seem to work either, there one needs * to ALSO specify MS_BIND for the remount, otherwise the entire * fs is remounted read-only or the mount fails because it's busy... * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as low as * 2.6.32... */ { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL }, { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL }, { LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL }, { 0, 0, NULL, NULL, NULL, 0, NULL } }; for (i = 0; default_mounts[i].match_mask; i++) { if ((flags & default_mounts[i].match_mask) == default_mounts[i].match_flag) { char *source = NULL; char *destination = NULL; int saved_errno; if (default_mounts[i].source) { /* will act like strdup if %r is not present */ source = lxc_string_replace("%r", conf->rootfs.mount, default_mounts[i].source); if (!source) { SYSERROR("memory allocation error"); return -1; } } if (default_mounts[i].destination) { /* will act like strdup if %r is not present */ destination = lxc_string_replace("%r", conf->rootfs.mount, default_mounts[i].destination); if (!destination) { saved_errno = errno; SYSERROR("memory allocation error"); free(source); errno = saved_errno; return -1; } } r = mount(source, destination, default_mounts[i].fstype, default_mounts[i].flags, default_mounts[i].options); saved_errno = errno; if (r < 0) SYSERROR("error mounting %s on %s", source, destination); free(source); free(destination); if (r < 0) { errno = saved_errno; return -1; } } } if (flags & LXC_AUTO_CGROUP_MASK) { if (!cgroup_mount(conf->rootfs.mount, handler, flags & LXC_AUTO_CGROUP_MASK)) { SYSERROR("error mounting /sys/fs/cgroup"); return -1; } } return 0; } static void print_top_failing_dir(const char *path) { size_t len = strlen(path); char *copy = alloca(len+1), *p, *e, saved; strcpy(copy, path); p = copy; e = copy + len; while (p < e) { while (p < e && *p == '/') p++; while (p < e && *p != '/') p++; if (p >= e) return; saved = *p; *p = '\0'; if (access(copy, X_OK)) { SYSERROR("could not access %s. Please grant it 'x' " \ "access, or add an ACL for the container root.", copy); return; } *p = saved; } } static int mount_rootfs(const char *rootfs, const char *target, const char *options) { char absrootfs[MAXPATHLEN]; struct stat s; int i; typedef int (*rootfs_cb)(const char *, const char *, const char *); struct rootfs_type { int type; rootfs_cb cb; } rtfs_type[] = { { S_IFDIR, mount_rootfs_dir }, { S_IFBLK, mount_rootfs_block }, { S_IFREG, mount_rootfs_file }, }; if (!realpath(rootfs, absrootfs)) { SYSERROR("failed to get real path for '%s'", rootfs); return -1; } if (access(absrootfs, F_OK)) { SYSERROR("'%s' is not accessible", absrootfs); return -1; } if (stat(absrootfs, &s)) { SYSERROR("failed to stat '%s'", absrootfs); return -1; } for (i = 0; i < sizeof(rtfs_type)/sizeof(rtfs_type[0]); i++) { if (!__S_ISTYPE(s.st_mode, rtfs_type[i].type)) continue; return rtfs_type[i].cb(absrootfs, target, options); } ERROR("unsupported rootfs type for '%s'", absrootfs); return -1; } static int setup_utsname(struct utsname *utsname) { if (!utsname) return 0; if (sethostname(utsname->nodename, strlen(utsname->nodename))) { SYSERROR("failed to set the hostname to '%s'", utsname->nodename); return -1; } INFO("'%s' hostname has been setup", utsname->nodename); return 0; } static int setup_tty(const struct lxc_rootfs *rootfs, const struct lxc_tty_info *tty_info, char *ttydir) { char path[MAXPATHLEN], lxcpath[MAXPATHLEN]; int i, ret; if (!rootfs->path) return 0; for (i = 0; i < tty_info->nbtty; i++) { struct lxc_pty_info *pty_info = &tty_info->pty_info[i]; ret = snprintf(path, sizeof(path), "%s/dev/tty%d", rootfs->mount, i + 1); if (ret >= sizeof(path)) { ERROR("pathname too long for ttys"); return -1; } if (ttydir) { /* create dev/lxc/tty%d" */ ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/tty%d", rootfs->mount, ttydir, i + 1); if (ret >= sizeof(lxcpath)) { ERROR("pathname too long for ttys"); return -1; } ret = creat(lxcpath, 0660); if (ret==-1 && errno != EEXIST) { SYSERROR("error creating %s", lxcpath); return -1; } if (ret >= 0) close(ret); ret = unlink(path); if (ret && errno != ENOENT) { SYSERROR("error unlinking %s", path); return -1; } if (mount(pty_info->name, lxcpath, "none", MS_BIND, 0)) { WARN("failed to mount '%s'->'%s'", pty_info->name, path); continue; } ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d", ttydir, i+1); if (ret >= sizeof(lxcpath)) { ERROR("tty pathname too long"); return -1; } ret = symlink(lxcpath, path); if (ret) { SYSERROR("failed to create symlink for tty %d", i+1); return -1; } } else { /* If we populated /dev, then we need to create /dev/ttyN */ if (access(path, F_OK)) { ret = creat(path, 0660); if (ret==-1) { SYSERROR("error creating %s", path); /* this isn't fatal, continue */ } else { close(ret); } } if (mount(pty_info->name, path, "none", MS_BIND, 0)) { WARN("failed to mount '%s'->'%s'", pty_info->name, path); continue; } } } INFO("%d tty(s) has been setup", tty_info->nbtty); return 0; } static int setup_rootfs_pivot_root_cb(char *buffer, void *data) { struct lxc_list *mountlist, *listentry, *iterator; char *pivotdir, *mountpoint, *mountentry, *saveptr = NULL; int found; void **cbparm; mountentry = buffer; cbparm = (void **)data; mountlist = cbparm[0]; pivotdir = cbparm[1]; /* parse entry, first field is mountname, ignore */ mountpoint = strtok_r(mountentry, " ", &saveptr); if (!mountpoint) return -1; /* second field is mountpoint */ mountpoint = strtok_r(NULL, " ", &saveptr); if (!mountpoint) return -1; /* only consider mountpoints below old root fs */ if (strncmp(mountpoint, pivotdir, strlen(pivotdir))) return 0; /* filter duplicate mountpoints */ found = 0; lxc_list_for_each(iterator, mountlist) { if (!strcmp(iterator->elem, mountpoint)) { found = 1; break; } } if (found) return 0; /* add entry to list */ listentry = malloc(sizeof(*listentry)); if (!listentry) { SYSERROR("malloc for mountpoint listentry failed"); return -1; } listentry->elem = strdup(mountpoint); if (!listentry->elem) { SYSERROR("strdup failed"); free(listentry); return -1; } lxc_list_add_tail(mountlist, listentry); return 0; } static int umount_oldrootfs(const char *oldrootfs) { char path[MAXPATHLEN]; void *cbparm[2]; struct lxc_list mountlist, *iterator, *next; int ok, still_mounted, last_still_mounted; int rc; /* read and parse /proc/mounts in old root fs */ lxc_list_init(&mountlist); /* oldrootfs is on the top tree directory now */ rc = snprintf(path, sizeof(path), "/%s", oldrootfs); if (rc >= sizeof(path)) { ERROR("rootfs name too long"); return -1; } cbparm[0] = &mountlist; cbparm[1] = strdup(path); if (!cbparm[1]) { SYSERROR("strdup failed"); return -1; } rc = snprintf(path, sizeof(path), "%s/proc/mounts", oldrootfs); if (rc >= sizeof(path)) { ERROR("container proc/mounts name too long"); return -1; } ok = lxc_file_for_each_line(path, setup_rootfs_pivot_root_cb, &cbparm); if (ok < 0) { SYSERROR("failed to read or parse mount list '%s'", path); return -1; } /* umount filesystems until none left or list no longer shrinks */ still_mounted = 0; do { last_still_mounted = still_mounted; still_mounted = 0; lxc_list_for_each_safe(iterator, &mountlist, next) { /* umount normally */ if (!umount(iterator->elem)) { DEBUG("umounted '%s'", (char *)iterator->elem); lxc_list_del(iterator); continue; } still_mounted++; } } while (still_mounted > 0 && still_mounted != last_still_mounted); lxc_list_for_each(iterator, &mountlist) { /* let's try a lazy umount */ if (!umount2(iterator->elem, MNT_DETACH)) { INFO("lazy unmount of '%s'", (char *)iterator->elem); continue; } /* be more brutal (nfs) */ if (!umount2(iterator->elem, MNT_FORCE)) { INFO("forced unmount of '%s'", (char *)iterator->elem); continue; } WARN("failed to unmount '%s'", (char *)iterator->elem); } return 0; } static int setup_rootfs_pivot_root(const char *rootfs, const char *pivotdir) { char path[MAXPATHLEN]; int remove_pivotdir = 0; int rc; /* change into new root fs */ if (chdir(rootfs)) { SYSERROR("can't chdir to new rootfs '%s'", rootfs); return -1; } if (!pivotdir) pivotdir = "lxc_putold"; /* compute the full path to pivotdir under rootfs */ rc = snprintf(path, sizeof(path), "%s/%s", rootfs, pivotdir); if (rc >= sizeof(path)) { ERROR("pivot dir name too long"); return -1; } if (access(path, F_OK)) { if (mkdir_p(path, 0755)) { SYSERROR("failed to create pivotdir '%s'", path); return -1; } remove_pivotdir = 1; DEBUG("created '%s' directory", path); } DEBUG("mountpoint for old rootfs is '%s'", path); /* pivot_root into our new root fs */ if (pivot_root(".", path)) { SYSERROR("pivot_root syscall failed"); return -1; } if (chdir("/")) { SYSERROR("can't chdir to / after pivot_root"); return -1; } DEBUG("pivot_root syscall to '%s' successful", rootfs); /* we switch from absolute path to relative path */ if (umount_oldrootfs(pivotdir)) return -1; /* remove temporary mount point, we don't consider the removing * as fatal */ if (remove_pivotdir && rmdir(pivotdir)) WARN("can't remove mountpoint '%s': %m", pivotdir); return 0; } /* * Note: This is a verbatum copy of what is in monitor.c. We're just * usint it here to generate a safe subdirectory in /dev/ for the * containers /dev/ */ /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS. * FNV has good anti collision properties and we're not worried * about pre-image resistance or one-way-ness, we're just trying to make * the name unique in the 108 bytes of space we have. */ #define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL) static uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval) { unsigned char *bp; for(bp = buf; bp < (unsigned char *)buf + len; bp++) { /* xor the bottom with the current octet */ hval ^= (uint64_t)*bp; /* gcc optimised: * multiply by the 64 bit FNV magic prime mod 2^64 */ hval += (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7) + (hval << 8) + (hval << 40); } return hval; } /* * Check to see if a directory has something mounted on it and, * if it does, return the fstype. * * Code largely based on detect_shared_rootfs below * * Returns: # of matching entries in /proc/self/mounts * if != 0 fstype is filled with the last filesystem value. * if == 0 no matches found, fstype unchanged. * * ToDo: Maybe return the mount options in another parameter... */ #define LINELEN 4096 #define MAX_FSTYPE_LEN 128 static int mount_check_fs( const char *dir, char *fstype ) { char buf[LINELEN], *p; struct stat s; FILE *f; int found_fs = 0; char *p2; DEBUG("entering mount_check_fs for %s", dir); if ( 0 != access(dir, F_OK) || 0 != stat(dir, &s) || 0 == S_ISDIR(s.st_mode) ) { return 0; } f = fopen("/proc/self/mounts", "r"); if (!f) return 0; while ((p = fgets(buf, LINELEN, f))) { p = index(buf, ' '); if( !p ) continue; *p = '\0'; p2 = p + 1; p = index(p2, ' '); if( !p ) continue; *p = '\0'; /* Compare the directory in the entry to desired */ if( strcmp( p2, dir ) ) { continue; } p2 = p + 1; p = index( p2, ' '); if( !p ) continue; *p = '\0'; ++found_fs; if( fstype ) { strncpy( fstype, p2, MAX_FSTYPE_LEN - 1 ); fstype [ MAX_FSTYPE_LEN - 1 ] = '\0'; } } fclose(f); DEBUG("mount_check_fs returning %d last %s", found_fs, fstype); return found_fs; } /* * Locate a devtmpfs mount (should be on /dev) and create a container * subdirectory on it which we can then bind mount to the container * /dev instead of mounting a tmpfs there. * If we fail, return NULL. * Else return the pointer to the name buffer with the string to * the devtmpfs subdirectory. */ static char *mk_devtmpfs(const char *name, char *path, const char *lxcpath) { int ret; struct stat s; char tmp_path[MAXPATHLEN]; char fstype[MAX_FSTYPE_LEN]; char *base_path = "/dev/.lxc"; char *user_path = "/dev/.lxc/user"; uint64_t hash; if ( 0 != access(base_path, F_OK) || 0 != stat(base_path, &s) || 0 == S_ISDIR(s.st_mode) ) { /* This is just making /dev/.lxc it better work or we're done */ ret = mkdir(base_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); if ( ret ) { SYSERROR( "Unable to create /dev/.lxc for autodev" ); return NULL; } } /* * Programmers notes: * We can not do mounts in this area of code that we want * to be visible in the host. Consequently, /dev/.lxc must * be set up earlier if we need a tmpfs mounted there. * That only affects the rare cases where autodev is enabled * for a container and devtmpfs is not mounted on /dev in the * host. In that case, we'll fall back to the old method * of mounting a tmpfs in the container and have no visibility * into the container /dev. */ if( ! mount_check_fs( "/dev", fstype ) || strcmp( "devtmpfs", fstype ) ) { /* Either /dev was not mounted or was not devtmpfs */ if ( ! mount_check_fs( "/dev/.lxc", NULL ) ) { /* * /dev/.lxc is not already mounted * Doing a mount here does no good, since * it's not visible in the host. */ ERROR("/dev/.lxc is not setup - taking fallback" ); return NULL; } } if ( 0 != access(user_path, F_OK) || 0 != stat(user_path, &s) || 0 == S_ISDIR(s.st_mode) ) { /* * This is making /dev/.lxc/user path for non-priv users. * If this doesn't work, we'll have to fall back in the * case of non-priv users. It's mode 1777 like /tmp. */ ret = mkdir(user_path, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX); if ( ret ) { /* Issue an error but don't fail yet! */ ERROR("Unable to create /dev/.lxc/user"); } /* Umask tends to screw us up here */ chmod(user_path, S_IRWXU | S_IRWXG | S_IRWXO | S_ISVTX); } /* * Since the container name must be unique within a given * lxcpath, we're going to use a hash of the path * /lxcpath/name as our hash name in /dev/.lxc/ */ ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s", lxcpath, name); if (ret < 0 || ret >= MAXPATHLEN) return NULL; hash = fnv_64a_buf(tmp_path, ret, FNV1A_64_INIT); ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, base_path, name, hash); if (ret < 0 || ret >= MAXPATHLEN) return NULL; if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) { ret = mkdir(tmp_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); if ( ret ) { /* Something must have failed with the base_path... * Maybe unpriv user. Try user_path now... */ INFO("Setup in /dev/.lxc failed. Trying /dev/.lxc/user." ); ret = snprintf(tmp_path, MAXPATHLEN, "%s/%s.%016" PRIx64, user_path, name, hash); if (ret < 0 || ret >= MAXPATHLEN) return NULL; if ( 0 != access(tmp_path, F_OK) || 0 != stat(tmp_path, &s) || 0 == S_ISDIR(s.st_mode) ) { ret = mkdir(tmp_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); if ( ret ) { ERROR("Container /dev setup in host /dev failed - taking fallback" ); return NULL; } } } } strcpy( path, tmp_path ); return path; } /* * Do we want to add options for max size of /dev and a file to * specify which devices to create? */ static int mount_autodev(const char *name, char *root, const char *lxcpath) { int ret; struct stat s; char path[MAXPATHLEN]; char host_path[MAXPATHLEN]; char devtmpfs_path[MAXPATHLEN]; INFO("Mounting /dev under %s", root); ret = snprintf(host_path, MAXPATHLEN, "%s/%s/rootfs.dev", lxcpath, name); if (ret < 0 || ret > MAXPATHLEN) return -1; ret = snprintf(path, MAXPATHLEN, "%s/dev", root); if (ret < 0 || ret > MAXPATHLEN) return -1; if (mk_devtmpfs( name, devtmpfs_path, lxcpath ) ) { /* * Get rid of old links and directoriess * This could be either a symlink and we remove it, * or an empty directory and we remove it, * or non-existant and we don't care, * or a non-empty directory, and we will then emit an error * but we will not fail out the process. */ unlink( host_path ); rmdir( host_path ); ret = symlink(devtmpfs_path, host_path); if ( ret < 0 ) { SYSERROR("WARNING: Failed to create symlink '%s'->'%s'", host_path, devtmpfs_path); } DEBUG("Bind mounting %s to %s", devtmpfs_path , path ); ret = mount(devtmpfs_path, path, NULL, MS_BIND, 0 ); } else { /* Only mount a tmpfs on here if we don't already a mount */ if ( ! mount_check_fs( host_path, NULL ) ) { DEBUG("Mounting tmpfs to %s", host_path ); ret = mount("none", path, "tmpfs", 0, "size=100000,mode=755"); } else { /* This allows someone to manually set up a mount */ DEBUG("Bind mounting %s to %s", host_path, path ); ret = mount(host_path , path, NULL, MS_BIND, 0 ); } } if (ret) { SYSERROR("Failed to mount /dev at %s", root); return -1; } ret = snprintf(path, MAXPATHLEN, "%s/dev/pts", root); if (ret < 0 || ret >= MAXPATHLEN) return -1; /* * If we are running on a devtmpfs mapping, dev/pts may already exist. * If not, then create it and exit if that fails... */ if ( 0 != access(path, F_OK) || 0 != stat(path, &s) || 0 == S_ISDIR(s.st_mode) ) { ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); if (ret) { SYSERROR("Failed to create /dev/pts in container"); return -1; } } INFO("Mounted /dev under %s", root); return 0; } struct lxc_devs { const char *name; mode_t mode; int maj; int min; }; static const struct lxc_devs lxc_devs[] = { { "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 }, { "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 }, { "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 }, { "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 }, { "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 }, { "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 }, { "console", S_IFCHR | S_IRUSR | S_IWUSR, 5, 1 }, }; static int setup_autodev(const char *root) { int ret; char path[MAXPATHLEN]; int i; mode_t cmask; INFO("Creating initial consoles under %s/dev", root); ret = snprintf(path, MAXPATHLEN, "%s/dev", root); if (ret < 0 || ret >= MAXPATHLEN) { ERROR("Error calculating container /dev location"); return -1; } INFO("Populating /dev under %s", root); cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH); for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) { const struct lxc_devs *d = &lxc_devs[i]; ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", root, d->name); if (ret < 0 || ret >= MAXPATHLEN) return -1; ret = mknod(path, d->mode, makedev(d->maj, d->min)); if (ret && errno != EEXIST) { SYSERROR("Error creating %s", d->name); return -1; } } umask(cmask); INFO("Populated /dev under %s", root); return 0; } /* * Detect whether / is mounted MS_SHARED. The only way I know of to * check that is through /proc/self/mountinfo. * I'm only checking for /. If the container rootfs or mount location * is MS_SHARED, but not '/', then you're out of luck - figuring that * out would be too much work to be worth it. */ #define LINELEN 4096 int detect_shared_rootfs(void) { char buf[LINELEN], *p; FILE *f; int i; char *p2; f = fopen("/proc/self/mountinfo", "r"); if (!f) return 0; while ((p = fgets(buf, LINELEN, f))) { for (p = buf, i=0; p && i < 4; i++) p = index(p+1, ' '); if (!p) continue; p2 = index(p+1, ' '); if (!p2) continue; *p2 = '\0'; if (strcmp(p+1, "/") == 0) { // this is '/'. is it shared? p = index(p2+1, ' '); if (p && strstr(p, "shared:")) { fclose(f); return 1; } } } fclose(f); return 0; } /* * I'll forgive you for asking whether all of this is needed :) The * answer is yes. * pivot_root will fail if the new root, the put_old dir, or the parent * of current->fs->root are MS_SHARED. (parent of current->fs_root may * or may not be current->fs_root - if we assumed it always was, we could * just mount --make-rslave /). So, * 1. mount a tiny tmpfs to be parent of current->fs->root. * 2. make that MS_SLAVE * 3. make a 'root' directory under that * 4. mount --rbind / under the $tinyroot/root. * 5. make that rslave * 6. chdir and chroot into $tinyroot/root * 7. $tinyroot will be unmounted by our parent in start.c */ static int chroot_into_slave(struct lxc_conf *conf) { char path[MAXPATHLEN]; const char *destpath = conf->rootfs.mount; int ret; if (mount(destpath, destpath, NULL, MS_BIND, 0)) { SYSERROR("failed to mount %s bind", destpath); return -1; } if (mount("", destpath, NULL, MS_SLAVE, 0)) { SYSERROR("failed to make %s slave", destpath); return -1; } if (mount("none", destpath, "tmpfs", 0, "size=10000,mode=755")) { SYSERROR("Failed to mount tmpfs / at %s", destpath); return -1; } ret = snprintf(path, MAXPATHLEN, "%s/root", destpath); if (ret < 0 || ret >= MAXPATHLEN) { ERROR("out of memory making root path"); return -1; } if (mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) { SYSERROR("Failed to create /dev/pts in container"); return -1; } if (mount("/", path, NULL, MS_BIND|MS_REC, 0)) { SYSERROR("Failed to rbind mount / to %s", path); return -1; } if (mount("", destpath, NULL, MS_SLAVE|MS_REC, 0)) { SYSERROR("Failed to make tmp-/ at %s rslave", path); return -1; } if (chdir(path)) { SYSERROR("Failed to chdir into tmp-/"); return -1; } if (chroot(path)) { SYSERROR("Failed to chroot into tmp-/"); return -1; } INFO("Chrooted into tmp-/ at %s", path); return 0; } static int setup_rootfs(struct lxc_conf *conf) { const struct lxc_rootfs *rootfs = &conf->rootfs; if (!rootfs->path) { if (mount("", "/", NULL, MS_SLAVE|MS_REC, 0)) { SYSERROR("Failed to make / rslave"); return -1; } return 0; } if (access(rootfs->mount, F_OK)) { SYSERROR("failed to access to '%s', check it is present", rootfs->mount); return -1; } if (access(rootfs->path, R_OK)) { print_top_failing_dir(rootfs->path); return -1; } if (detect_shared_rootfs()) { if (chroot_into_slave(conf)) { ERROR("Failed to chroot into slave /"); return -1; } } // First try mounting rootfs using a bdev struct bdev *bdev = bdev_init(rootfs->path, rootfs->mount, rootfs->options); if (bdev && bdev->ops->mount(bdev) == 0) { bdev_put(bdev); DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount); return 0; } if (bdev) bdev_put(bdev); if (mount_rootfs(rootfs->path, rootfs->mount, rootfs->options)) { ERROR("failed to mount rootfs"); return -1; } DEBUG("mounted '%s' on '%s'", rootfs->path, rootfs->mount); return 0; } static int setup_pivot_root(const struct lxc_rootfs *rootfs) { if (!rootfs->path) return 0; if (setup_rootfs_pivot_root(rootfs->mount, rootfs->pivot)) { ERROR("failed to setup pivot root"); return -1; } return 0; } static int setup_pts(int pts) { char target[PATH_MAX]; if (!pts) return 0; if (!access("/dev/pts/ptmx", F_OK) && umount("/dev/pts")) { SYSERROR("failed to umount 'dev/pts'"); return -1; } if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL, "newinstance,ptmxmode=0666,mode=0620,gid=5")) { SYSERROR("failed to mount a new instance of '/dev/pts'"); return -1; } if (access("/dev/ptmx", F_OK)) { if (!symlink("/dev/pts/ptmx", "/dev/ptmx")) goto out; SYSERROR("failed to symlink '/dev/pts/ptmx'->'/dev/ptmx'"); return -1; } if (realpath("/dev/ptmx", target) && !strcmp(target, "/dev/pts/ptmx")) goto out; /* fallback here, /dev/pts/ptmx exists just mount bind */ if (mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0)) { SYSERROR("mount failed '/dev/pts/ptmx'->'/dev/ptmx'"); return -1; } INFO("created new pts instance"); out: return 0; } static int setup_personality(int persona) { #if HAVE_SYS_PERSONALITY_H if (persona == -1) return 0; if (personality(persona) < 0) { SYSERROR("failed to set personality to '0x%x'", persona); return -1; } INFO("set personality to '0x%x'", persona); #endif return 0; } static int setup_dev_console(const struct lxc_rootfs *rootfs, const struct lxc_console *console) { char path[MAXPATHLEN]; struct stat s; int ret; ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount); if (ret >= sizeof(path)) { ERROR("console path too long"); return -1; } if (access(path, F_OK)) { WARN("rootfs specified but no console found at '%s'", path); return 0; } if (console->master < 0) { INFO("no console"); return 0; } if (stat(path, &s)) { SYSERROR("failed to stat '%s'", path); return -1; } if (chmod(console->name, s.st_mode)) { SYSERROR("failed to set mode '0%o' to '%s'", s.st_mode, console->name); return -1; } if (mount(console->name, path, "none", MS_BIND, 0)) { ERROR("failed to mount '%s' on '%s'", console->name, path); return -1; } INFO("console has been setup"); return 0; } static int setup_ttydir_console(const struct lxc_rootfs *rootfs, const struct lxc_console *console, char *ttydir) { char path[MAXPATHLEN], lxcpath[MAXPATHLEN]; int ret; /* create rootfs/dev/ directory */ ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount, ttydir); if (ret >= sizeof(path)) return -1; ret = mkdir(path, 0755); if (ret && errno != EEXIST) { SYSERROR("failed with errno %d to create %s", errno, path); return -1; } INFO("created %s", path); ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console", rootfs->mount, ttydir); if (ret >= sizeof(lxcpath)) { ERROR("console path too long"); return -1; } snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount); ret = unlink(path); if (ret && errno != ENOENT) { SYSERROR("error unlinking %s", path); return -1; } ret = creat(lxcpath, 0660); if (ret==-1 && errno != EEXIST) { SYSERROR("error %d creating %s", errno, lxcpath); return -1; } if (ret >= 0) close(ret); if (console->master < 0) { INFO("no console"); return 0; } if (mount(console->name, lxcpath, "none", MS_BIND, 0)) { ERROR("failed to mount '%s' on '%s'", console->name, lxcpath); return -1; } /* create symlink from rootfs/dev/console to 'lxc/console' */ ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir); if (ret >= sizeof(lxcpath)) { ERROR("lxc/console path too long"); return -1; } ret = symlink(lxcpath, path); if (ret) { SYSERROR("failed to create symlink for console"); return -1; } INFO("console has been setup on %s", lxcpath); return 0; } static int setup_console(const struct lxc_rootfs *rootfs, const struct lxc_console *console, char *ttydir) { /* We don't have a rootfs, /dev/console will be shared */ if (!rootfs->path) return 0; if (!ttydir) return setup_dev_console(rootfs, console); return setup_ttydir_console(rootfs, console, ttydir); } static int setup_kmsg(const struct lxc_rootfs *rootfs, const struct lxc_console *console) { char kpath[MAXPATHLEN]; int ret; if (!rootfs->path) return 0; ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount); if (ret < 0 || ret >= sizeof(kpath)) return -1; ret = unlink(kpath); if (ret && errno != ENOENT) { SYSERROR("error unlinking %s", kpath); return -1; } ret = symlink("console", kpath); if (ret) { SYSERROR("failed to create symlink for kmsg"); return -1; } return 0; } static void parse_mntopt(char *opt, unsigned long *flags, char **data) { struct mount_opt *mo; /* If opt is found in mount_opt, set or clear flags. * Otherwise append it to data. */ for (mo = &mount_opt[0]; mo->name != NULL; mo++) { if (!strncmp(opt, mo->name, strlen(mo->name))) { if (mo->clear) *flags &= ~mo->flag; else *flags |= mo->flag; return; } } if (strlen(*data)) strcat(*data, ","); strcat(*data, opt); } int parse_mntopts(const char *mntopts, unsigned long *mntflags, char **mntdata) { char *s, *data; char *p, *saveptr = NULL; *mntdata = NULL; *mntflags = 0L; if (!mntopts) return 0; s = strdup(mntopts); if (!s) { SYSERROR("failed to allocate memory"); return -1; } data = malloc(strlen(s) + 1); if (!data) { SYSERROR("failed to allocate memory"); free(s); return -1; } *data = 0; for (p = strtok_r(s, ",", &saveptr); p != NULL; p = strtok_r(NULL, ",", &saveptr)) parse_mntopt(p, mntflags, &data); if (*data) *mntdata = data; else free(data); free(s); return 0; } static int mount_entry(const char *fsname, const char *target, const char *fstype, unsigned long mountflags, const char *data) { if (mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data)) { SYSERROR("failed to mount '%s' on '%s'", fsname, target); return -1; } if ((mountflags & MS_REMOUNT) || (mountflags & MS_BIND)) { DEBUG("remounting %s on %s to respect bind or remount options", fsname, target); if (mount(fsname, target, fstype, mountflags | MS_REMOUNT, data)) { SYSERROR("failed to mount '%s' on '%s'", fsname, target); return -1; } } DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype); return 0; } static inline int mount_entry_on_systemfs(const struct mntent *mntent) { unsigned long mntflags; char *mntdata; int ret; FILE *pathfile = NULL; char* pathdirname = NULL; if (hasmntopt(mntent, "create=dir")) { if (!mkdir_p(mntent->mnt_dir, 0755)) { WARN("Failed to create mount target '%s'", mntent->mnt_dir); ret = -1; } } if (hasmntopt(mntent, "create=file") && access(mntent->mnt_dir, F_OK)) { pathdirname = strdup(mntent->mnt_dir); pathdirname = dirname(pathdirname); mkdir_p(pathdirname, 0755); pathfile = fopen(mntent->mnt_dir, "wb"); if (!pathfile) { WARN("Failed to create mount target '%s'", mntent->mnt_dir); ret = -1; } else fclose(pathfile); } if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) { free(mntdata); return -1; } ret = mount_entry(mntent->mnt_fsname, mntent->mnt_dir, mntent->mnt_type, mntflags, mntdata); if (hasmntopt(mntent, "optional") != NULL) ret = 0; free(pathdirname); free(mntdata); return ret; } static int mount_entry_on_absolute_rootfs(const struct mntent *mntent, const struct lxc_rootfs *rootfs, const char *lxc_name) { char *aux; char path[MAXPATHLEN]; unsigned long mntflags; char *mntdata; int r, ret = 0, offset; const char *lxcpath; FILE *pathfile = NULL; char *pathdirname = NULL; lxcpath = lxc_global_config_value("lxc.lxcpath"); if (!lxcpath) { ERROR("Out of memory"); return -1; } /* if rootfs->path is a blockdev path, allow container fstab to * use $lxcpath/CN/rootfs as the target prefix */ r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name); if (r < 0 || r >= MAXPATHLEN) goto skipvarlib; aux = strstr(mntent->mnt_dir, path); if (aux) { offset = strlen(path); goto skipabs; } skipvarlib: aux = strstr(mntent->mnt_dir, rootfs->path); if (!aux) { WARN("ignoring mount point '%s'", mntent->mnt_dir); goto out; } offset = strlen(rootfs->path); skipabs: r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount, aux + offset); if (r < 0 || r >= MAXPATHLEN) { WARN("pathnme too long for '%s'", mntent->mnt_dir); ret = -1; goto out; } if (hasmntopt(mntent, "create=dir")) { if (!mkdir_p(path, 0755)) { WARN("Failed to create mount target '%s'", path); ret = -1; } } if (hasmntopt(mntent, "create=file") && access(path, F_OK)) { pathdirname = strdup(path); pathdirname = dirname(pathdirname); mkdir_p(pathdirname, 0755); pathfile = fopen(path, "wb"); if (!pathfile) { WARN("Failed to create mount target '%s'", path); ret = -1; } else fclose(pathfile); } if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) { free(mntdata); return -1; } ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags, mntdata); free(mntdata); if (hasmntopt(mntent, "optional") != NULL) ret = 0; out: free(pathdirname); return ret; } static int mount_entry_on_relative_rootfs(const struct mntent *mntent, const char *rootfs) { char path[MAXPATHLEN]; unsigned long mntflags; char *mntdata; int ret; FILE *pathfile = NULL; char *pathdirname = NULL; /* relative to root mount point */ ret = snprintf(path, sizeof(path), "%s/%s", rootfs, mntent->mnt_dir); if (ret >= sizeof(path)) { ERROR("path name too long"); return -1; } if (hasmntopt(mntent, "create=dir")) { if (!mkdir_p(path, 0755)) { WARN("Failed to create mount target '%s'", path); ret = -1; } } if (hasmntopt(mntent, "create=file") && access(path, F_OK)) { pathdirname = strdup(path); pathdirname = dirname(pathdirname); mkdir_p(pathdirname, 0755); pathfile = fopen(path, "wb"); if (!pathfile) { WARN("Failed to create mount target '%s'", path); ret = -1; } else fclose(pathfile); } if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0) { free(mntdata); return -1; } ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, mntflags, mntdata); if (hasmntopt(mntent, "optional") != NULL) ret = 0; free(pathdirname); free(mntdata); return ret; } static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file, const char *lxc_name) { struct mntent mntent; char buf[4096]; int ret = -1; while (getmntent_r(file, &mntent, buf, sizeof(buf))) { if (!rootfs->path) { if (mount_entry_on_systemfs(&mntent)) goto out; continue; } /* We have a separate root, mounts are relative to it */ if (mntent.mnt_dir[0] != '/') { if (mount_entry_on_relative_rootfs(&mntent, rootfs->mount)) goto out; continue; } if (mount_entry_on_absolute_rootfs(&mntent, rootfs, lxc_name)) goto out; } ret = 0; INFO("mount points have been setup"); out: return ret; } static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab, const char *lxc_name) { FILE *file; int ret; if (!fstab) return 0; file = setmntent(fstab, "r"); if (!file) { SYSERROR("failed to use '%s'", fstab); return -1; } ret = mount_file_entries(rootfs, file, lxc_name); endmntent(file); return ret; } static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list *mount, const char *lxc_name) { FILE *file; struct lxc_list *iterator; char *mount_entry; int ret; file = tmpfile(); if (!file) { ERROR("tmpfile error: %m"); return -1; } lxc_list_for_each(iterator, mount) { mount_entry = iterator->elem; fprintf(file, "%s\n", mount_entry); } rewind(file); ret = mount_file_entries(rootfs, file, lxc_name); fclose(file); return ret; } static int setup_caps(struct lxc_list *caps) { struct lxc_list *iterator; char *drop_entry; char *ptr; int i, capid; lxc_list_for_each(iterator, caps) { drop_entry = iterator->elem; capid = -1; for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) { if (strcmp(drop_entry, caps_opt[i].name)) continue; capid = caps_opt[i].value; break; } if (capid < 0) { /* try to see if it's numeric, so the user may specify * capabilities that the running kernel knows about but * we don't */ errno = 0; capid = strtol(drop_entry, &ptr, 10); if (!ptr || *ptr != '\0' || errno != 0) /* not a valid number */ capid = -1; else if (capid > lxc_caps_last_cap()) /* we have a number but it's not a valid * capability */ capid = -1; } if (capid < 0) { ERROR("unknown capability %s", drop_entry); return -1; } DEBUG("drop capability '%s' (%d)", drop_entry, capid); if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) { SYSERROR("failed to remove %s capability", drop_entry); return -1; } } DEBUG("capabilities have been setup"); return 0; } static int dropcaps_except(struct lxc_list *caps) { struct lxc_list *iterator; char *keep_entry; char *ptr; int i, capid; int numcaps = lxc_caps_last_cap() + 1; INFO("found %d capabilities", numcaps); if (numcaps <= 0 || numcaps > 200) return -1; // caplist[i] is 1 if we keep capability i int *caplist = alloca(numcaps * sizeof(int)); memset(caplist, 0, numcaps * sizeof(int)); lxc_list_for_each(iterator, caps) { keep_entry = iterator->elem; capid = -1; for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) { if (strcmp(keep_entry, caps_opt[i].name)) continue; capid = caps_opt[i].value; break; } if (capid < 0) { /* try to see if it's numeric, so the user may specify * capabilities that the running kernel knows about but * we don't */ capid = strtol(keep_entry, &ptr, 10); if (!ptr || *ptr != '\0' || capid == INT_MIN || capid == INT_MAX) /* not a valid number */ capid = -1; else if (capid > lxc_caps_last_cap()) /* we have a number but it's not a valid * capability */ capid = -1; } if (capid < 0) { ERROR("unknown capability %s", keep_entry); return -1; } DEBUG("drop capability '%s' (%d)", keep_entry, capid); caplist[capid] = 1; } for (i=0; ielem; err = lxc_ipv4_addr_add(ifindex, &inetdev->addr, &inetdev->bcast, inetdev->prefix); if (err) { ERROR("failed to setup_ipv4_addr ifindex %d : %s", ifindex, strerror(-err)); return -1; } } return 0; } static int setup_ipv6_addr(struct lxc_list *ip, int ifindex) { struct lxc_list *iterator; struct lxc_inet6dev *inet6dev; int err; lxc_list_for_each(iterator, ip) { inet6dev = iterator->elem; err = lxc_ipv6_addr_add(ifindex, &inet6dev->addr, &inet6dev->mcast, &inet6dev->acast, inet6dev->prefix); if (err) { ERROR("failed to setup_ipv6_addr ifindex %d : %s", ifindex, strerror(-err)); return -1; } } return 0; } static int setup_netdev(struct lxc_netdev *netdev) { char ifname[IFNAMSIZ]; char *current_ifname = ifname; int err; /* empty network namespace */ if (!netdev->ifindex) { if (netdev->flags & IFF_UP) { err = lxc_netdev_up("lo"); if (err) { ERROR("failed to set the loopback up : %s", strerror(-err)); return -1; } } return 0; } /* get the new ifindex in case of physical netdev */ if (netdev->type == LXC_NET_PHYS) if (!(netdev->ifindex = if_nametoindex(netdev->link))) { ERROR("failed to get ifindex for %s", netdev->link); return -1; } /* retrieve the name of the interface */ if (!if_indextoname(netdev->ifindex, current_ifname)) { ERROR("no interface corresponding to index '%d'", netdev->ifindex); return -1; } /* default: let the system to choose one interface name */ if (!netdev->name) netdev->name = netdev->type == LXC_NET_PHYS ? netdev->link : "eth%d"; /* rename the interface name */ err = lxc_netdev_rename_by_name(ifname, netdev->name); if (err) { ERROR("failed to rename %s->%s : %s", ifname, netdev->name, strerror(-err)); return -1; } /* Re-read the name of the interface because its name has changed * and would be automatically allocated by the system */ if (!if_indextoname(netdev->ifindex, current_ifname)) { ERROR("no interface corresponding to index '%d'", netdev->ifindex); return -1; } /* set a mac address */ if (netdev->hwaddr) { if (setup_hw_addr(netdev->hwaddr, current_ifname)) { ERROR("failed to setup hw address for '%s'", current_ifname); return -1; } } /* setup ipv4 addresses on the interface */ if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) { ERROR("failed to setup ip addresses for '%s'", ifname); return -1; } /* setup ipv6 addresses on the interface */ if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) { ERROR("failed to setup ipv6 addresses for '%s'", ifname); return -1; } /* set the network device up */ if (netdev->flags & IFF_UP) { int err; err = lxc_netdev_up(current_ifname); if (err) { ERROR("failed to set '%s' up : %s", current_ifname, strerror(-err)); return -1; } /* the network is up, make the loopback up too */ err = lxc_netdev_up("lo"); if (err) { ERROR("failed to set the loopback up : %s", strerror(-err)); return -1; } } /* We can only set up the default routes after bringing * up the interface, sine bringing up the interface adds * the link-local routes and we can't add a default * route if the gateway is not reachable. */ /* setup ipv4 gateway on the interface */ if (netdev->ipv4_gateway) { if (!(netdev->flags & IFF_UP)) { ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname); return -1; } if (lxc_list_empty(&netdev->ipv4)) { ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname); return -1; } err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway); if (err) { err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway); if (err) { ERROR("failed to add ipv4 dest for '%s': %s", ifname, strerror(-err)); } err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway); if (err) { ERROR("failed to setup ipv4 gateway for '%s': %s", ifname, strerror(-err)); if (netdev->ipv4_gateway_auto) { char buf[INET_ADDRSTRLEN]; inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf)); ERROR("tried to set autodetected ipv4 gateway '%s'", buf); } return -1; } } } /* setup ipv6 gateway on the interface */ if (netdev->ipv6_gateway) { if (!(netdev->flags & IFF_UP)) { ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname); return -1; } if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) { ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname); return -1; } err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway); if (err) { err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway); if (err) { ERROR("failed to add ipv6 dest for '%s': %s", ifname, strerror(-err)); } err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway); if (err) { ERROR("failed to setup ipv6 gateway for '%s': %s", ifname, strerror(-err)); if (netdev->ipv6_gateway_auto) { char buf[INET6_ADDRSTRLEN]; inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf)); ERROR("tried to set autodetected ipv6 gateway '%s'", buf); } return -1; } } } DEBUG("'%s' has been setup", current_ifname); return 0; } static int setup_network(struct lxc_list *network) { struct lxc_list *iterator; struct lxc_netdev *netdev; lxc_list_for_each(iterator, network) { netdev = iterator->elem; if (setup_netdev(netdev)) { ERROR("failed to setup netdev"); return -1; } } if (!lxc_list_empty(network)) INFO("network has been setup"); return 0; } void lxc_rename_phys_nics_on_shutdown(struct lxc_conf *conf) { int i; INFO("running to reset %d nic names", conf->num_savednics); for (i=0; inum_savednics; i++) { struct saved_nic *s = &conf->saved_nics[i]; INFO("resetting nic %d to %s", s->ifindex, s->orig_name); lxc_netdev_rename_by_index(s->ifindex, s->orig_name); free(s->orig_name); } conf->num_savednics = 0; free(conf->saved_nics); } static char *default_rootfs_mount = LXCROOTFSMOUNT; struct lxc_conf *lxc_conf_init(void) { struct lxc_conf *new; int i; new = malloc(sizeof(*new)); if (!new) { ERROR("lxc_conf_init : %m"); return NULL; } memset(new, 0, sizeof(*new)); new->loglevel = LXC_LOG_PRIORITY_NOTSET; new->personality = -1; new->autodev = -1; new->console.log_path = NULL; new->console.log_fd = -1; new->console.path = NULL; new->console.peer = -1; new->console.peerpty.busy = -1; new->console.peerpty.master = -1; new->console.peerpty.slave = -1; new->console.master = -1; new->console.slave = -1; new->console.name[0] = '\0'; new->maincmd_fd = -1; new->rootfs.mount = strdup(default_rootfs_mount); if (!new->rootfs.mount) { ERROR("lxc_conf_init : %m"); free(new); return NULL; } new->kmsg = 1; lxc_list_init(&new->cgroup); lxc_list_init(&new->network); lxc_list_init(&new->mount_list); lxc_list_init(&new->caps); lxc_list_init(&new->keepcaps); lxc_list_init(&new->id_map); for (i=0; ihooks[i]); lxc_list_init(&new->groups); new->lsm_aa_profile = NULL; new->lsm_se_context = NULL; new->lsm_umount_proc = 0; for (i = 0; i < LXC_NS_MAX; i++) new->inherit_ns_fd[i] = -1; return new; } static int instanciate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev) { char veth1buf[IFNAMSIZ], *veth1; char veth2buf[IFNAMSIZ], *veth2; int err; if (netdev->priv.veth_attr.pair) veth1 = netdev->priv.veth_attr.pair; else { err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX"); if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */ ERROR("veth1 name too long"); return -1; } veth1 = lxc_mkifname(veth1buf); if (!veth1) { ERROR("failed to allocate a temporary name"); return -1; } /* store away for deconf */ memcpy(netdev->priv.veth_attr.veth1, veth1, IFNAMSIZ); } snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX"); veth2 = lxc_mkifname(veth2buf); if (!veth2) { ERROR("failed to allocate a temporary name"); goto out_delete; } err = lxc_veth_create(veth1, veth2); if (err) { ERROR("failed to create %s-%s : %s", veth1, veth2, strerror(-err)); goto out_delete; } /* changing the high byte of the mac address to 0xfe, the bridge interface * will always keep the host's mac address and not take the mac address * of a container */ err = setup_private_host_hw_addr(veth1); if (err) { ERROR("failed to change mac address of host interface '%s' : %s", veth1, strerror(-err)); goto out_delete; } if (netdev->mtu) { err = lxc_netdev_set_mtu(veth1, atoi(netdev->mtu)); if (!err) err = lxc_netdev_set_mtu(veth2, atoi(netdev->mtu)); if (err) { ERROR("failed to set mtu '%s' for %s-%s : %s", netdev->mtu, veth1, veth2, strerror(-err)); goto out_delete; } } if (netdev->link) { err = lxc_bridge_attach(netdev->link, veth1); if (err) { ERROR("failed to attach '%s' to the bridge '%s' : %s", veth1, netdev->link, strerror(-err)); goto out_delete; } } netdev->ifindex = if_nametoindex(veth2); if (!netdev->ifindex) { ERROR("failed to retrieve the index for %s", veth2); goto out_delete; } err = lxc_netdev_up(veth1); if (err) { ERROR("failed to set %s up : %s", veth1, strerror(-err)); goto out_delete; } if (netdev->upscript) { err = run_script(handler->name, "net", netdev->upscript, "up", "veth", veth1, (char*) NULL); if (err) goto out_delete; } DEBUG("instanciated veth '%s/%s', index is '%d'", veth1, veth2, netdev->ifindex); return 0; out_delete: lxc_netdev_delete_by_name(veth1); if (!netdev->priv.veth_attr.pair && veth1) free(veth1); if(veth2) free(veth2); return -1; } static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev) { char *veth1; int err; if (netdev->priv.veth_attr.pair) veth1 = netdev->priv.veth_attr.pair; else veth1 = netdev->priv.veth_attr.veth1; if (netdev->downscript) { err = run_script(handler->name, "net", netdev->downscript, "down", "veth", veth1, (char*) NULL); if (err) return -1; } return 0; } static int instanciate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev) { char peerbuf[IFNAMSIZ], *peer; int err; if (!netdev->link) { ERROR("no link specified for macvlan netdev"); return -1; } err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX"); if (err >= sizeof(peerbuf)) return -1; peer = lxc_mkifname(peerbuf); if (!peer) { ERROR("failed to make a temporary name"); return -1; } err = lxc_macvlan_create(netdev->link, peer, netdev->priv.macvlan_attr.mode); if (err) { ERROR("failed to create macvlan interface '%s' on '%s' : %s", peer, netdev->link, strerror(-err)); goto out; } netdev->ifindex = if_nametoindex(peer); if (!netdev->ifindex) { ERROR("failed to retrieve the index for %s", peer); goto out; } if (netdev->upscript) { err = run_script(handler->name, "net", netdev->upscript, "up", "macvlan", netdev->link, (char*) NULL); if (err) goto out; } DEBUG("instanciated macvlan '%s', index is '%d' and mode '%d'", peer, netdev->ifindex, netdev->priv.macvlan_attr.mode); return 0; out: lxc_netdev_delete_by_name(peer); free(peer); return -1; } static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev) { int err; if (netdev->downscript) { err = run_script(handler->name, "net", netdev->downscript, "down", "macvlan", netdev->link, (char*) NULL); if (err) return -1; } return 0; } /* XXX: merge with instanciate_macvlan */ static int instanciate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev) { char peer[IFNAMSIZ]; int err; if (!netdev->link) { ERROR("no link specified for vlan netdev"); return -1; } err = snprintf(peer, sizeof(peer), "vlan%d", netdev->priv.vlan_attr.vid); if (err >= sizeof(peer)) { ERROR("peer name too long"); return -1; } err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid); if (err) { ERROR("failed to create vlan interface '%s' on '%s' : %s", peer, netdev->link, strerror(-err)); return -1; } netdev->ifindex = if_nametoindex(peer); if (!netdev->ifindex) { ERROR("failed to retrieve the ifindex for %s", peer); lxc_netdev_delete_by_name(peer); return -1; } DEBUG("instanciated vlan '%s', ifindex is '%d'", " vlan1000", netdev->ifindex); return 0; } static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev) { return 0; } static int instanciate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev) { if (!netdev->link) { ERROR("no link specified for the physical interface"); return -1; } netdev->ifindex = if_nametoindex(netdev->link); if (!netdev->ifindex) { ERROR("failed to retrieve the index for %s", netdev->link); return -1; } if (netdev->upscript) { int err; err = run_script(handler->name, "net", netdev->upscript, "up", "phys", netdev->link, (char*) NULL); if (err) return -1; } return 0; } static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev) { int err; if (netdev->downscript) { err = run_script(handler->name, "net", netdev->downscript, "down", "phys", netdev->link, (char*) NULL); if (err) return -1; } return 0; } static int instanciate_none(struct lxc_handler *handler, struct lxc_netdev *netdev) { netdev->ifindex = 0; return 0; } static int instanciate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev) { netdev->ifindex = 0; if (netdev->upscript) { int err; err = run_script(handler->name, "net", netdev->upscript, "up", "empty", (char*) NULL); if (err) return -1; } return 0; } static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev) { int err; if (netdev->downscript) { err = run_script(handler->name, "net", netdev->downscript, "down", "empty", (char*) NULL); if (err) return -1; } return 0; } static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev) { return 0; } int lxc_requests_empty_network(struct lxc_handler *handler) { struct lxc_list *network = &handler->conf->network; struct lxc_list *iterator; struct lxc_netdev *netdev; bool found_none = false, found_nic = false; if (lxc_list_empty(network)) return 0; lxc_list_for_each(iterator, network) { netdev = iterator->elem; if (netdev->type == LXC_NET_NONE) found_none = true; else found_nic = true; } if (found_none && !found_nic) return 1; return 0; } int lxc_create_network(struct lxc_handler *handler) { struct lxc_list *network = &handler->conf->network; struct lxc_list *iterator; struct lxc_netdev *netdev; int am_root = (getuid() == 0); if (!am_root) return 0; lxc_list_for_each(iterator, network) { netdev = iterator->elem; if (netdev->type < 0 || netdev->type > LXC_NET_MAXCONFTYPE) { ERROR("invalid network configuration type '%d'", netdev->type); return -1; } if (netdev_conf[netdev->type](handler, netdev)) { ERROR("failed to create netdev"); return -1; } } return 0; } void lxc_delete_network(struct lxc_handler *handler) { struct lxc_list *network = &handler->conf->network; struct lxc_list *iterator; struct lxc_netdev *netdev; lxc_list_for_each(iterator, network) { netdev = iterator->elem; if (netdev->ifindex != 0 && netdev->type == LXC_NET_PHYS) { if (lxc_netdev_rename_by_index(netdev->ifindex, netdev->link)) WARN("failed to rename to the initial name the " \ "netdev '%s'", netdev->link); continue; } if (netdev_deconf[netdev->type](handler, netdev)) { WARN("failed to destroy netdev"); } /* Recent kernel remove the virtual interfaces when the network * namespace is destroyed but in case we did not moved the * interface to the network namespace, we have to destroy it */ if (netdev->ifindex != 0 && lxc_netdev_delete_by_index(netdev->ifindex)) WARN("failed to remove interface '%s'", netdev->name); } } #define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic" static int unpriv_assign_nic(struct lxc_netdev *netdev, pid_t pid) { pid_t child; if (netdev->type != LXC_NET_VETH) { ERROR("nic type %d not support for unprivileged use", netdev->type); return -1; } if ((child = fork()) < 0) { SYSERROR("fork"); return -1; } if (child > 0) return wait_for_pid(child); // Call lxc-user-nic pid type bridge char pidstr[20]; char *args[] = {LXC_USERNIC_PATH, pidstr, "veth", netdev->link, netdev->name, NULL }; snprintf(pidstr, 19, "%lu", (unsigned long) pid); pidstr[19] = '\0'; execvp(args[0], args); SYSERROR("execvp lxc-user-nic"); exit(1); } int lxc_assign_network(struct lxc_list *network, pid_t pid) { struct lxc_list *iterator; struct lxc_netdev *netdev; int am_root = (getuid() == 0); int err; lxc_list_for_each(iterator, network) { netdev = iterator->elem; if (netdev->type == LXC_NET_VETH && !am_root) { if (unpriv_assign_nic(netdev, pid)) return -1; // TODO fill in netdev->ifindex and name continue; } /* empty network namespace, nothing to move */ if (!netdev->ifindex) continue; err = lxc_netdev_move_by_index(netdev->ifindex, pid); if (err) { ERROR("failed to move '%s' to the container : %s", netdev->link, strerror(-err)); return -1; } DEBUG("move '%s' to '%d'", netdev->name, pid); } return 0; } static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf, size_t buf_size) { char path[PATH_MAX]; int ret, closeret; FILE *f; ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g'); if (ret < 0 || ret >= PATH_MAX) { fprintf(stderr, "%s: path name too long", __func__); return -E2BIG; } f = fopen(path, "w"); if (!f) { perror("open"); return -EINVAL; } ret = fwrite(buf, buf_size, 1, f); if (ret < 0) SYSERROR("writing id mapping"); closeret = fclose(f); if (closeret) SYSERROR("writing id mapping"); return ret < 0 ? ret : closeret; } int lxc_map_ids(struct lxc_list *idmap, pid_t pid) { struct lxc_list *iterator; struct id_map *map; int ret = 0; enum idtype type; char *buf = NULL, *pos; int am_root = (getuid() == 0); for(type = ID_TYPE_UID; type <= ID_TYPE_GID; type++) { int left, fill; int had_entry = 0; if (!buf) { buf = pos = malloc(4096); if (!buf) return -ENOMEM; } pos = buf; if (!am_root) pos += sprintf(buf, "new%cidmap %d", type == ID_TYPE_UID ? 'u' : 'g', pid); lxc_list_for_each(iterator, idmap) { /* The kernel only takes <= 4k for writes to /proc//[ug]id_map */ map = iterator->elem; if (map->idtype != type) continue; had_entry = 1; left = 4096 - (pos - buf); fill = snprintf(pos, left, "%s%lu %lu %lu%s", am_root ? "" : " ", map->nsid, map->hostid, map->range, am_root ? "\n" : ""); if (fill <= 0 || fill >= left) SYSERROR("snprintf failed, too many mappings"); pos += fill; } if (!had_entry) continue; if (am_root) { ret = write_id_mapping(type, pid, buf, pos-buf); } else { left = 4096 - (pos - buf); fill = snprintf(pos, left, "\n"); if (fill <= 0 || fill >= left) SYSERROR("snprintf failed, too many mappings"); pos += fill; ret = system(buf); } if (ret) break; } if (buf) free(buf); return ret; } /* * return the host uid to which the container root is mapped in *val. * Return true if id was found, false otherwise. */ bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype, unsigned long *val) { struct lxc_list *it; struct id_map *map; lxc_list_for_each(it, &conf->id_map) { map = it->elem; if (map->idtype != ID_TYPE_UID) continue; if (map->nsid != 0) continue; *val = map->hostid; return true; } return false; } int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype) { struct lxc_list *it; struct id_map *map; lxc_list_for_each(it, &conf->id_map) { map = it->elem; if (map->idtype != idtype) continue; if (id >= map->hostid && id < map->hostid + map->range) return (id - map->hostid) + map->nsid; } return -1; } int find_unmapped_nsuid(struct lxc_conf *conf, enum idtype idtype) { struct lxc_list *it; struct id_map *map; unsigned int freeid = 0; again: lxc_list_for_each(it, &conf->id_map) { map = it->elem; if (map->idtype != idtype) continue; if (freeid >= map->nsid && freeid < map->nsid + map->range) { freeid = map->nsid + map->range; goto again; } } return freeid; } int lxc_find_gateway_addresses(struct lxc_handler *handler) { struct lxc_list *network = &handler->conf->network; struct lxc_list *iterator; struct lxc_netdev *netdev; int link_index; lxc_list_for_each(iterator, network) { netdev = iterator->elem; if (!netdev->ipv4_gateway_auto && !netdev->ipv6_gateway_auto) continue; if (netdev->type != LXC_NET_VETH && netdev->type != LXC_NET_MACVLAN) { ERROR("gateway = auto only supported for " "veth and macvlan"); return -1; } if (!netdev->link) { ERROR("gateway = auto needs a link interface"); return -1; } link_index = if_nametoindex(netdev->link); if (!link_index) return -EINVAL; if (netdev->ipv4_gateway_auto) { if (lxc_ipv4_addr_get(link_index, &netdev->ipv4_gateway)) { ERROR("failed to automatically find ipv4 gateway " "address from link interface '%s'", netdev->link); return -1; } } if (netdev->ipv6_gateway_auto) { if (lxc_ipv6_addr_get(link_index, &netdev->ipv6_gateway)) { ERROR("failed to automatically find ipv6 gateway " "address from link interface '%s'", netdev->link); return -1; } } } return 0; } int lxc_create_tty(const char *name, struct lxc_conf *conf) { struct lxc_tty_info *tty_info = &conf->tty_info; int i, ret; /* no tty in the configuration */ if (!conf->tty) return 0; tty_info->pty_info = malloc(sizeof(*tty_info->pty_info)*conf->tty); if (!tty_info->pty_info) { SYSERROR("failed to allocate pty_info"); return -1; } for (i = 0; i < conf->tty; i++) { struct lxc_pty_info *pty_info = &tty_info->pty_info[i]; process_lock(); ret = openpty(&pty_info->master, &pty_info->slave, pty_info->name, NULL, NULL); process_unlock(); if (ret) { SYSERROR("failed to create pty #%d", i); tty_info->nbtty = i; lxc_delete_tty(tty_info); return -1; } DEBUG("allocated pty '%s' (%d/%d)", pty_info->name, pty_info->master, pty_info->slave); /* Prevent leaking the file descriptors to the container */ fcntl(pty_info->master, F_SETFD, FD_CLOEXEC); fcntl(pty_info->slave, F_SETFD, FD_CLOEXEC); pty_info->busy = 0; } tty_info->nbtty = conf->tty; INFO("tty's configured"); return 0; } void lxc_delete_tty(struct lxc_tty_info *tty_info) { int i; for (i = 0; i < tty_info->nbtty; i++) { struct lxc_pty_info *pty_info = &tty_info->pty_info[i]; close(pty_info->master); close(pty_info->slave); } free(tty_info->pty_info); tty_info->nbtty = 0; } /* * chown_mapped_root: for an unprivileged user with uid X to chown a dir * to subuid Y, he needs to run chown as root in a userns where * nsid 0 is mapped to hostuid Y, and nsid Y is mapped to hostuid * X. That way, the container root is privileged with respect to * hostuid X, allowing him to do the chown. */ int chown_mapped_root(char *path, struct lxc_conf *conf) { uid_t rootid; pid_t pid; unsigned long val; if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) { ERROR("No mapping for container root"); return -1; } rootid = (uid_t) val; if (geteuid() == 0) { if (chown(path, rootid, -1) < 0) { ERROR("Error chowning %s", path); return -1; } return 0; } pid = fork(); if (pid < 0) { SYSERROR("Failed forking"); return -1; } if (!pid) { int hostuid = geteuid(), ret; char map1[100], map2[100], map3[100]; char *args[] = {"lxc-usernsexec", "-m", map1, "-m", map2, "-m", map3, "--", "chown", "0", path, NULL}; // "u:0:rootid:1" ret = snprintf(map1, 100, "u:0:%d:1", rootid); if (ret < 0 || ret >= 100) { ERROR("Error uid printing map string"); return -1; } // "u:hostuid:hostuid:1" ret = snprintf(map2, 100, "u:%d:%d:1", hostuid, hostuid); if (ret < 0 || ret >= 100) { ERROR("Error uid printing map string"); return -1; } // "g:0:hostgid:1" ret = snprintf(map3, 100, "g:0:%d:1", getgid()); if (ret < 0 || ret >= 100) { ERROR("Error uid printing map string"); return -1; } ret = execvp("lxc-usernsexec", args); SYSERROR("Failed executing usernsexec"); exit(1); } return wait_for_pid(pid); } int ttys_shift_ids(struct lxc_conf *c) { int i; if (lxc_list_empty(&c->id_map)) return 0; for (i = 0; i < c->tty_info.nbtty; i++) { struct lxc_pty_info *pty_info = &c->tty_info.pty_info[i]; if (chown_mapped_root(pty_info->name, c) < 0) { ERROR("Failed to chown %s", pty_info->name); return -1; } } if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) { ERROR("Failed to chown %s", c->console.name); return -1; } return 0; } /* * This routine is called when the configuration does not already specify a value * for autodev (mounting a file system on /dev and populating it in a container). * If a hard override value has not be specified, then we try to apply some * heuristics to determine if we should switch to autodev mode. * * For instance, if the container has an /etc/systemd/system directory then it * is probably running systemd as the init process and it needs the autodev * mount to prevent it from mounting devtmpfs on /dev on it's own causing conflicts * in the host. * * We may also want to enable autodev if the host has devtmpfs mounted on its * /dev as this then enable us to use subdirectories under /dev for the container * /dev directories and we can fake udev devices. */ struct start_args { char *const *argv; }; #define MAX_SYMLINK_DEPTH 32 static int check_autodev( const char *rootfs, void *data ) { struct start_args *arg = data; int ret; int loop_count = 0; struct stat s; char absrootfs[MAXPATHLEN]; char path[MAXPATHLEN]; char abs_path[MAXPATHLEN]; char *command = "/sbin/init"; if (rootfs == NULL || strlen(rootfs) == 0) return -2; if (!realpath(rootfs, absrootfs)) return -2; if( arg && arg->argv[0] ) { command = arg->argv[0]; DEBUG("Set exec command to %s", command ); } strncpy( path, command, MAXPATHLEN-1 ); if ( 0 != access(path, F_OK) || 0 != stat(path, &s) ) return -2; /* Dereference down the symlink merry path testing as we go. */ /* If anything references systemd in the path - set autodev! */ /* Renormalize to the rootfs before each dereference */ /* Relative symlinks should fall out in the wash even with .. */ while( 1 ) { if ( strstr( path, "systemd" ) ) { INFO("Container with systemd init detected - enabling autodev!"); return 1; } ret = snprintf(abs_path, MAXPATHLEN-1, "%s/%s", absrootfs, path); if (ret < 0 || ret > MAXPATHLEN) return -2; ret = readlink( abs_path, path, MAXPATHLEN-1 ); if ( ( ret <= 0 ) || ( ++loop_count > MAX_SYMLINK_DEPTH ) ) { break; /* Break out for other tests */ } path[ret] = '\0'; } /* * Add future checks here. * Return positive if we should go autodev * Return 0 if we should NOT go autodev * Return negative if we encounter an error or can not determine... */ /* All else fails, we don't need autodev */ INFO("Autodev not required."); return 0; } int lxc_setup(struct lxc_handler *handler) { const char *name = handler->name; struct lxc_conf *lxc_conf = handler->conf; const char *lxcpath = handler->lxcpath; void *data = handler->data; if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) { if (setup_utsname(lxc_conf->utsname)) { ERROR("failed to setup the utsname for '%s'", name); return -1; } } if (setup_network(&lxc_conf->network)) { ERROR("failed to setup the network for '%s'", name); return -1; } if (run_lxc_hooks(name, "pre-mount", lxc_conf, lxcpath, NULL)) { ERROR("failed to run pre-mount hooks for container '%s'.", name); return -1; } if (setup_rootfs(lxc_conf)) { ERROR("failed to setup rootfs for '%s'", name); return -1; } if (lxc_conf->autodev < 0) { lxc_conf->autodev = check_autodev(lxc_conf->rootfs.mount, data); } if (lxc_conf->autodev > 0) { if (mount_autodev(name, lxc_conf->rootfs.mount, lxcpath)) { ERROR("failed to mount /dev in the container"); return -1; } } /* do automatic mounts (mainly /proc and /sys), but exclude * those that need to wait until other stuff has finished */ if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) { ERROR("failed to setup the automatic mounts for '%s'", name); return -1; } if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)) { ERROR("failed to setup the mounts for '%s'", name); return -1; } if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name)) { ERROR("failed to setup the mount entries for '%s'", name); return -1; } /* now mount only cgroup, if wanted; * before, /sys could not have been mounted * (is either mounted automatically or via fstab entries) */ if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) { ERROR("failed to setup the automatic mounts for '%s'", name); return -1; } if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) { ERROR("failed to run mount hooks for container '%s'.", name); return -1; } if (lxc_conf->autodev > 0) { if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) { ERROR("failed to run autodev hooks for container '%s'.", name); return -1; } if (setup_autodev(lxc_conf->rootfs.mount)) { ERROR("failed to populate /dev in the container"); return -1; } } if (!lxc_conf->is_execute && setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) { ERROR("failed to setup the console for '%s'", name); return -1; } if (lxc_conf->kmsg) { if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail ERROR("failed to setup kmsg for '%s'", name); } if (!lxc_conf->is_execute && setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) { ERROR("failed to setup the ttys for '%s'", name); return -1; } /* mount /proc if needed for LSM transition */ if (lsm_proc_mount(lxc_conf) < 0) { ERROR("failed to LSM mount proc for '%s'", name); return -1; } if (setup_pivot_root(&lxc_conf->rootfs)) { ERROR("failed to set rootfs for '%s'", name); return -1; } if (setup_pts(lxc_conf->pts)) { ERROR("failed to setup the new pts instance"); return -1; } if (setup_personality(lxc_conf->personality)) { ERROR("failed to setup personality"); return -1; } if (lxc_list_empty(&lxc_conf->id_map)) { if (!lxc_list_empty(&lxc_conf->keepcaps)) { if (!lxc_list_empty(&lxc_conf->caps)) { ERROR("Simultaneously requested dropping and keeping caps"); return -1; } if (dropcaps_except(&lxc_conf->keepcaps)) { ERROR("failed to keep requested caps"); return -1; } } else if (setup_caps(&lxc_conf->caps)) { ERROR("failed to drop capabilities"); return -1; } } NOTICE("'%s' is setup.", name); return 0; } int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf, const char *lxcpath, char *argv[]) { int which = -1; struct lxc_list *it; if (strcmp(hook, "pre-start") == 0) which = LXCHOOK_PRESTART; else if (strcmp(hook, "pre-mount") == 0) which = LXCHOOK_PREMOUNT; else if (strcmp(hook, "mount") == 0) which = LXCHOOK_MOUNT; else if (strcmp(hook, "autodev") == 0) which = LXCHOOK_AUTODEV; else if (strcmp(hook, "start") == 0) which = LXCHOOK_START; else if (strcmp(hook, "post-stop") == 0) which = LXCHOOK_POSTSTOP; else if (strcmp(hook, "clone") == 0) which = LXCHOOK_CLONE; else return -1; lxc_list_for_each(it, &conf->hooks[which]) { int ret; char *hookname = it->elem; ret = run_script_argv(name, "lxc", hookname, hook, lxcpath, argv); if (ret) return ret; } return 0; } static void lxc_remove_nic(struct lxc_list *it) { struct lxc_netdev *netdev = it->elem; struct lxc_list *it2,*next; lxc_list_del(it); if (netdev->link) free(netdev->link); if (netdev->name) free(netdev->name); if (netdev->type == LXC_NET_VETH && netdev->priv.veth_attr.pair) free(netdev->priv.veth_attr.pair); if (netdev->upscript) free(netdev->upscript); if (netdev->hwaddr) free(netdev->hwaddr); if (netdev->mtu) free(netdev->mtu); if (netdev->ipv4_gateway) free(netdev->ipv4_gateway); if (netdev->ipv6_gateway) free(netdev->ipv6_gateway); lxc_list_for_each_safe(it2, &netdev->ipv4, next) { lxc_list_del(it2); free(it2->elem); free(it2); } lxc_list_for_each_safe(it2, &netdev->ipv6, next) { lxc_list_del(it2); free(it2->elem); free(it2); } free(netdev); free(it); } /* we get passed in something like '0', '0.ipv4' or '1.ipv6' */ int lxc_clear_nic(struct lxc_conf *c, const char *key) { char *p1; int ret, idx, i; struct lxc_list *it; struct lxc_netdev *netdev; p1 = index(key, '.'); if (!p1 || *(p1+1) == '\0') p1 = NULL; ret = sscanf(key, "%d", &idx); if (ret != 1) return -1; if (idx < 0) return -1; i = 0; lxc_list_for_each(it, &c->network) { if (i == idx) break; i++; } if (i < idx) // we don't have that many nics defined return -1; if (!it || !it->elem) return -1; netdev = it->elem; if (!p1) { lxc_remove_nic(it); } else if (strcmp(p1, ".ipv4") == 0) { struct lxc_list *it2,*next; lxc_list_for_each_safe(it2, &netdev->ipv4, next) { lxc_list_del(it2); free(it2->elem); free(it2); } } else if (strcmp(p1, ".ipv6") == 0) { struct lxc_list *it2,*next; lxc_list_for_each_safe(it2, &netdev->ipv6, next) { lxc_list_del(it2); free(it2->elem); free(it2); } } else if (strcmp(p1, ".link") == 0) { if (netdev->link) { free(netdev->link); netdev->link = NULL; } } else if (strcmp(p1, ".name") == 0) { if (netdev->name) { free(netdev->name); netdev->name = NULL; } } else if (strcmp(p1, ".script.up") == 0) { if (netdev->upscript) { free(netdev->upscript); netdev->upscript = NULL; } } else if (strcmp(p1, ".hwaddr") == 0) { if (netdev->hwaddr) { free(netdev->hwaddr); netdev->hwaddr = NULL; } } else if (strcmp(p1, ".mtu") == 0) { if (netdev->mtu) { free(netdev->mtu); netdev->mtu = NULL; } } else if (strcmp(p1, ".ipv4_gateway") == 0) { if (netdev->ipv4_gateway) { free(netdev->ipv4_gateway); netdev->ipv4_gateway = NULL; } } else if (strcmp(p1, ".ipv6_gateway") == 0) { if (netdev->ipv6_gateway) { free(netdev->ipv6_gateway); netdev->ipv6_gateway = NULL; } } else return -1; return 0; } int lxc_clear_config_network(struct lxc_conf *c) { struct lxc_list *it,*next; lxc_list_for_each_safe(it, &c->network, next) { lxc_remove_nic(it); } return 0; } int lxc_clear_config_caps(struct lxc_conf *c) { struct lxc_list *it,*next; lxc_list_for_each_safe(it, &c->caps, next) { lxc_list_del(it); free(it->elem); free(it); } return 0; } static int lxc_free_idmap(struct lxc_list *id_map) { struct lxc_list *it, *next; lxc_list_for_each_safe(it, id_map, next) { lxc_list_del(it); free(it->elem); free(it); } return 0; } int lxc_clear_idmaps(struct lxc_conf *c) { return lxc_free_idmap(&c->id_map); } int lxc_clear_config_keepcaps(struct lxc_conf *c) { struct lxc_list *it,*next; lxc_list_for_each_safe(it, &c->keepcaps, next) { lxc_list_del(it); free(it->elem); free(it); } return 0; } int lxc_clear_cgroups(struct lxc_conf *c, const char *key) { struct lxc_list *it,*next; bool all = false; const char *k = key + 11; if (strcmp(key, "lxc.cgroup") == 0) all = true; lxc_list_for_each_safe(it, &c->cgroup, next) { struct lxc_cgroup *cg = it->elem; if (!all && strcmp(cg->subsystem, k) != 0) continue; lxc_list_del(it); free(cg->subsystem); free(cg->value); free(cg); free(it); } return 0; } int lxc_clear_groups(struct lxc_conf *c) { struct lxc_list *it,*next; lxc_list_for_each_safe(it, &c->groups, next) { lxc_list_del(it); free(it->elem); free(it); } return 0; } int lxc_clear_mount_entries(struct lxc_conf *c) { struct lxc_list *it,*next; lxc_list_for_each_safe(it, &c->mount_list, next) { lxc_list_del(it); free(it->elem); free(it); } return 0; } int lxc_clear_hooks(struct lxc_conf *c, const char *key) { struct lxc_list *it,*next; bool all = false, done = false; const char *k = key + 9; int i; if (strcmp(key, "lxc.hook") == 0) all = true; for (i=0; ihooks[i], next) { lxc_list_del(it); free(it->elem); free(it); } done = true; } } if (!done) { ERROR("Invalid hook key: %s", key); return -1; } return 0; } static void lxc_clear_saved_nics(struct lxc_conf *conf) { int i; if (!conf->num_savednics) return; for (i=0; i < conf->num_savednics; i++) free(conf->saved_nics[i].orig_name); conf->saved_nics = 0; free(conf->saved_nics); } void lxc_conf_free(struct lxc_conf *conf) { if (!conf) return; if (conf->console.path) free(conf->console.path); if (conf->rootfs.mount) free(conf->rootfs.mount); if (conf->rootfs.options) free(conf->rootfs.options); if (conf->rootfs.path) free(conf->rootfs.path); if (conf->rootfs.pivot) free(conf->rootfs.pivot); if (conf->logfile) free(conf->logfile); if (conf->utsname) free(conf->utsname); if (conf->ttydir) free(conf->ttydir); if (conf->fstab) free(conf->fstab); if (conf->rcfile) free(conf->rcfile); lxc_clear_config_network(conf); if (conf->lsm_aa_profile) free(conf->lsm_aa_profile); if (conf->lsm_se_context) free(conf->lsm_se_context); lxc_seccomp_free(conf); lxc_clear_config_caps(conf); lxc_clear_config_keepcaps(conf); lxc_clear_cgroups(conf, "lxc.cgroup"); lxc_clear_hooks(conf, "lxc.hook"); lxc_clear_mount_entries(conf); lxc_clear_saved_nics(conf); lxc_clear_idmaps(conf); lxc_clear_groups(conf); free(conf); } struct userns_fn_data { int (*fn)(void *); void *arg; int p[2]; }; static int run_userns_fn(void *data) { struct userns_fn_data *d = data; char c; // we're not sharing with the parent any more, if it was a thread close(d->p[1]); if (read(d->p[0], &c, 1) != 1) return -1; close(d->p[0]); return d->fn(d->arg); } /* * Add a ID_TYPE_UID entry to an existing lxc_conf, if it is not * alread there. * We may want to generalize this to do gids as well as uids, but right now * it's not necessary. */ static struct lxc_list *idmap_add_id(struct lxc_conf *conf, uid_t uid) { int hostid_mapped = mapped_hostid(uid, conf, ID_TYPE_UID); struct lxc_list *new = NULL, *tmp, *it, *next; struct id_map *entry; new = malloc(sizeof(*new)); if (!new) { ERROR("Out of memory building id map"); return NULL; } lxc_list_init(new); if (hostid_mapped < 0) { hostid_mapped = find_unmapped_nsuid(conf, ID_TYPE_UID); if (hostid_mapped < 0) goto err; tmp = malloc(sizeof(*tmp)); if (!tmp) goto err; entry = malloc(sizeof(*entry)); if (!entry) { free(tmp); goto err; } tmp->elem = entry; entry->idtype = ID_TYPE_UID; entry->nsid = hostid_mapped; entry->hostid = (unsigned long)uid; entry->range = 1; lxc_list_add_tail(new, tmp); } lxc_list_for_each_safe(it, &conf->id_map, next) { tmp = malloc(sizeof(*tmp)); if (!tmp) goto err; entry = malloc(sizeof(*entry)); if (!entry) { free(tmp); goto err; } memset(entry, 0, sizeof(*entry)); memcpy(entry, it->elem, sizeof(*entry)); tmp->elem = entry; lxc_list_add_tail(new, tmp); } return new; err: ERROR("Out of memory building a new uid map"); if (new) lxc_free_idmap(new); free(new); return NULL; } /* * Run a function in a new user namespace. * The caller's euid will be mapped in if it is not already. */ int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data) { int ret, pid; struct userns_fn_data d; char c = '1'; int p[2]; struct lxc_list *idmap; ret = pipe(p); if (ret < 0) { SYSERROR("opening pipe"); return -1; } d.fn = fn; d.arg = data; d.p[0] = p[0]; d.p[1] = p[1]; pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER); if (pid < 0) goto err; close(p[0]); p[0] = -1; if ((idmap = idmap_add_id(conf, geteuid())) == NULL) { ERROR("Error adding self to container uid map"); goto err; } ret = lxc_map_ids(idmap, pid); lxc_free_idmap(idmap); free(idmap); if (ret) { ERROR("Error setting up child mappings"); goto err; } // kick the child if (write(p[1], &c, 1) != 1) { SYSERROR("writing to pipe to child"); goto err; } ret = wait_for_pid(pid); close(p[1]); return ret; err: if (p[0] != -1) close(p[0]); close(p[1]); return -1; }