diff --git a/doc/lxc.container.conf.sgml.in b/doc/lxc.container.conf.sgml.in index 7b599e54d..eb3241f79 100644 --- a/doc/lxc.container.conf.sgml.in +++ b/doc/lxc.container.conf.sgml.in @@ -760,6 +760,18 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA container. This is useful to mount /etc, /var or /home for examples. + + NOTE - LXC will generally ensure that mount targets and relative + bind-mount sources are properly confined under the container + root, to avoid attacks involving over-mounting host directories + and files. (Symbolic links in absolute mount sources are ignored) + However, if the container configuration first mounts a directory which + is under the control of the container user, such as /home/joe, into + the container at some path, and then mounts + under path, then a TOCTTOU attack would be + possible where the container user modifies a symbolic link under + his home directory at just the right time. + diff --git a/src/lxc/cgfs.c b/src/lxc/cgfs.c index fcb3cde15..df2e6b233 100644 --- a/src/lxc/cgfs.c +++ b/src/lxc/cgfs.c @@ -1363,7 +1363,10 @@ static bool cgroupfs_mount_cgroup(void *hdata, const char *root, int type) if (!path) return false; snprintf(path, bufsz, "%s/sys/fs/cgroup", root); - r = mount("cgroup_root", path, "tmpfs", MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME, "size=10240k,mode=755"); + r = safe_mount("cgroup_root", path, "tmpfs", + MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME, + "size=10240k,mode=755", + root); if (r < 0) { SYSERROR("could not mount tmpfs to /sys/fs/cgroup in the container"); return false; diff --git a/src/lxc/cgmanager.c b/src/lxc/cgmanager.c index 8386d83ab..a677c22c7 100644 --- a/src/lxc/cgmanager.c +++ b/src/lxc/cgmanager.c @@ -1477,7 +1477,7 @@ static bool cgm_bind_dir(const char *root, const char *dirname) } /* mount a tmpfs there so we can create subdirs */ - if (mount("cgroup", cgpath, "tmpfs", 0, "size=10000,mode=755")) { + if (safe_mount("cgroup", cgpath, "tmpfs", 0, "size=10000,mode=755", root)) { SYSERROR("Failed to mount tmpfs at %s", cgpath); return false; } @@ -1488,7 +1488,7 @@ static bool cgm_bind_dir(const char *root, const char *dirname) return false; } - if (mount(dirname, cgpath, "none", MS_BIND, 0)) { + if (safe_mount(dirname, cgpath, "none", MS_BIND, 0, root)) { SYSERROR("Failed to bind mount %s to %s", dirname, cgpath); return false; } diff --git a/src/lxc/conf.c b/src/lxc/conf.c index 53057dea4..bb4c19f82 100644 --- a/src/lxc/conf.c +++ b/src/lxc/conf.c @@ -769,10 +769,11 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha * 2.6.32... */ { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL }, - { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/net", NULL, MS_BIND, NULL }, + /* proc/tty is used as a temporary placeholder for proc/sys/net which we'll move back in a few steps */ + { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys/net", "%r/proc/tty", NULL, MS_BIND, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL }, - { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/net", "%r/proc/sys/net", NULL, MS_MOVE, NULL }, + { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/tty", "%r/proc/sys/net", NULL, MS_MOVE, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL }, { LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL }, @@ -815,7 +816,7 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha } mflags = add_required_remount_flags(source, destination, default_mounts[i].flags); - r = mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options); + r = safe_mount(source, destination, default_mounts[i].fstype, mflags, default_mounts[i].options, conf->rootfs.path ? conf->rootfs.mount : NULL); saved_errno = errno; if (r < 0 && errno == ENOENT) { INFO("Mount source or target for %s on %s doesn't exist. Skipping.", source, destination); @@ -1167,7 +1168,8 @@ static int mount_autodev(const char *name, const struct lxc_rootfs *rootfs, cons return 0; } - if (mount("none", path, "tmpfs", 0, "size=100000,mode=755")) { + if (safe_mount("none", path, "tmpfs", 0, "size=100000,mode=755", + rootfs->path ? rootfs->mount : NULL)) { SYSERROR("Failed mounting tmpfs onto %s\n", path); return false; } @@ -1252,7 +1254,8 @@ static int fill_autodev(const struct lxc_rootfs *rootfs) return -1; } fclose(pathfile); - if (mount(hostpath, path, 0, MS_BIND, NULL) != 0) { + if (safe_mount(hostpath, path, 0, MS_BIND, NULL, + rootfs->path ? rootfs->mount : NULL) != 0) { SYSERROR("Failed bind mounting device %s from host into container", d->name); return -1; @@ -1505,7 +1508,7 @@ static int setup_dev_console(const struct lxc_rootfs *rootfs, return -1; } - if (mount(console->name, path, "none", MS_BIND, 0)) { + if (safe_mount(console->name, path, "none", MS_BIND, 0, rootfs->mount)) { ERROR("failed to mount '%s' on '%s'", console->name, path); return -1; } @@ -1560,7 +1563,7 @@ static int setup_ttydir_console(const struct lxc_rootfs *rootfs, return 0; } - if (mount(console->name, lxcpath, "none", MS_BIND, 0)) { + if (safe_mount(console->name, lxcpath, "none", MS_BIND, 0, rootfs->mount)) { ERROR("failed to mount '%s' on '%s'", console->name, lxcpath); return -1; } @@ -1710,13 +1713,13 @@ static char *get_field(char *src, int nfields) static int mount_entry(const char *fsname, const char *target, const char *fstype, unsigned long mountflags, - const char *data, int optional) + const char *data, int optional, const char *rootfs) { #ifdef HAVE_STATVFS struct statvfs sb; #endif - if (mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data)) { + if (safe_mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data, rootfs)) { if (optional) { INFO("failed to mount '%s' on '%s' (optional): %s", fsname, target, strerror(errno)); @@ -1763,7 +1766,7 @@ static int mount_entry(const char *fsname, const char *target, #endif if (mount(fsname, target, fstype, - mountflags | MS_REMOUNT, data)) { + mountflags | MS_REMOUNT, data) < 0) { if (optional) { INFO("failed to mount '%s' on '%s' (optional): %s", fsname, target, strerror(errno)); @@ -1843,7 +1846,7 @@ static int mount_entry_create_dir_file(const struct mntent *mntent, } static inline int mount_entry_on_generic(struct mntent *mntent, - const char* path) + const char* path, const char *rootfs) { unsigned long mntflags; char *mntdata; @@ -1863,7 +1866,7 @@ static inline int mount_entry_on_generic(struct mntent *mntent, } ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type, - mntflags, mntdata, optional); + mntflags, mntdata, optional, rootfs); free(mntdata); @@ -1872,7 +1875,7 @@ static inline int mount_entry_on_generic(struct mntent *mntent, static inline int mount_entry_on_systemfs(struct mntent *mntent) { - return mount_entry_on_generic(mntent, mntent->mnt_dir); + return mount_entry_on_generic(mntent, mntent->mnt_dir, NULL); } static int mount_entry_on_absolute_rootfs(struct mntent *mntent, @@ -1919,7 +1922,7 @@ skipabs: return -1; } - return mount_entry_on_generic(mntent, path); + return mount_entry_on_generic(mntent, path, rootfs->mount); } static int mount_entry_on_relative_rootfs(struct mntent *mntent, @@ -1935,7 +1938,7 @@ static int mount_entry_on_relative_rootfs(struct mntent *mntent, return -1; } - return mount_entry_on_generic(mntent, path); + return mount_entry_on_generic(mntent, path, rootfs); } static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file, @@ -3602,7 +3605,7 @@ void lxc_execute_bind_init(struct lxc_conf *conf) fclose(pathfile); } - ret = mount(path, destpath, "none", MS_BIND, NULL); + ret = safe_mount(path, destpath, "none", MS_BIND, NULL, conf->rootfs.mount); if (ret < 0) SYSERROR("Failed to bind lxc.init.static into container"); INFO("lxc.init.static bound into container at %s", path); diff --git a/src/lxc/utils.c b/src/lxc/utils.c index 8e7fc52f2..788cbe136 100644 --- a/src/lxc/utils.c +++ b/src/lxc/utils.c @@ -1404,6 +1404,239 @@ int setproctitle(char *title) return ret; } +/* + * @path: a pathname where / replaced with '\0'. + * @offsetp: pointer to int showing which path segment was last seen. + * Updated on return to reflect the next segment. + * @fulllen: full original path length. + * Returns a pointer to the next path segment, or NULL if done. + */ +static char *get_nextpath(char *path, int *offsetp, int fulllen) +{ + int offset = *offsetp; + + if (offset >= fulllen) + return NULL; + + while (path[offset] != '\0' && offset < fulllen) + offset++; + while (path[offset] == '\0' && offset < fulllen) + offset++; + + *offsetp = offset; + return (offset < fulllen) ? &path[offset] : NULL; +} + +/* + * Check that @subdir is a subdir of @dir. @len is the length of + * @dir (to avoid having to recalculate it). + */ +static bool is_subdir(const char *subdir, const char *dir, size_t len) +{ + size_t subdirlen = strlen(subdir); + + if (subdirlen < len) + return false; + if (strncmp(subdir, dir, len) != 0) + return false; + if (dir[len-1] == '/') + return true; + if (subdir[len] == '/' || subdirlen == len) + return true; + return false; +} + +/* + * Check if the open fd is a symlink. Return -ELOOP if it is. Return + * -ENOENT if we couldn't fstat. Return 0 if the fd is ok. + */ +static int check_symlink(int fd) +{ + struct stat sb; + int ret = fstat(fd, &sb); + if (ret < 0) + return -ENOENT; + if (S_ISLNK(sb.st_mode)) + return -ELOOP; + return 0; +} + +/* + * Open a file or directory, provided that it contains no symlinks. + * + * CAVEAT: This function must not be used for other purposes than container + * setup before executing the container's init + */ +static int open_if_safe(int dirfd, const char *nextpath) +{ + int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW); + if (newfd >= 0) // was not a symlink, all good + return newfd; + + if (errno == ELOOP) + return newfd; + + if (errno == EPERM || errno == EACCES) { + /* we're not root (cause we got EPERM) so + try opening with O_PATH */ + newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW); + if (newfd >= 0) { + /* O_PATH will return an fd for symlinks. We know + * nextpath wasn't a symlink at last openat, so if fd + * is now a link, then something * fishy is going on + */ + int ret = check_symlink(newfd); + if (ret < 0) { + close(newfd); + newfd = ret; + } + } + } + + return newfd; +} + +/* + * Open a path intending for mounting, ensuring that the final path + * is inside the container's rootfs. + * + * CAVEAT: This function must not be used for other purposes than container + * setup before executing the container's init + * + * @target: path to be opened + * @prefix_skip: a part of @target in which to ignore symbolic links. This + * would be the container's rootfs. + * + * Return an open fd for the path, or <0 on error. + */ +static int open_without_symlink(const char *target, const char *prefix_skip) +{ + int curlen = 0, dirfd, fulllen, i; + char *dup = NULL; + + fulllen = strlen(target); + + /* make sure prefix-skip makes sense */ + if (prefix_skip) { + curlen = strlen(prefix_skip); + if (!is_subdir(target, prefix_skip, curlen)) { + ERROR("WHOA there - target '%s' didn't start with prefix '%s'", + target, prefix_skip); + return -EINVAL; + } + /* + * get_nextpath() expects the curlen argument to be + * on a (turned into \0) / or before it, so decrement + * curlen to make sure that happens + */ + if (curlen) + curlen--; + } else { + prefix_skip = "/"; + curlen = 0; + } + + /* Make a copy of target which we can hack up, and tokenize it */ + if ((dup = strdup(target)) == NULL) { + SYSERROR("Out of memory checking for symbolic link"); + return -ENOMEM; + } + for (i = 0; i < fulllen; i++) { + if (dup[i] == '/') + dup[i] = '\0'; + } + + dirfd = open(prefix_skip, O_RDONLY); + if (dirfd < 0) + goto out; + while (1) { + int newfd, saved_errno; + char *nextpath; + + if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL) + goto out; + newfd = open_if_safe(dirfd, nextpath); + saved_errno = errno; + close(dirfd); + dirfd = newfd; + if (newfd < 0) { + errno = saved_errno; + if (errno == ELOOP) + SYSERROR("%s in %s was a symbolic link!", nextpath, target); + else + SYSERROR("Error examining %s in %s", nextpath, target); + goto out; + } + } + +out: + free(dup); + return dirfd; +} + +/* + * Safely mount a path into a container, ensuring that the mount target + * is under the container's @rootfs. (If @rootfs is NULL, then the container + * uses the host's /) + * + * CAVEAT: This function must not be used for other purposes than container + * setup before executing the container's init + */ +int safe_mount(const char *src, const char *dest, const char *fstype, + unsigned long flags, const void *data, const char *rootfs) +{ + int srcfd = -1, destfd, ret, saved_errno; + char srcbuf[50], destbuf[50]; // only needs enough for /proc/self/fd/ + const char *mntsrc = src; + + if (!rootfs) + rootfs = ""; + + /* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */ + if (flags & MS_BIND && src && src[0] != '/') { + INFO("this is a relative bind mount"); + srcfd = open_without_symlink(src, NULL); + if (srcfd < 0) + return srcfd; + ret = snprintf(srcbuf, 50, "/proc/self/fd/%d", srcfd); + if (ret < 0 || ret > 50) { + close(srcfd); + ERROR("Out of memory"); + return -EINVAL; + } + mntsrc = srcbuf; + } + + destfd = open_without_symlink(dest, rootfs); + if (destfd < 0) { + if (srcfd != -1) + close(srcfd); + return destfd; + } + + ret = snprintf(destbuf, 50, "/proc/self/fd/%d", destfd); + if (ret < 0 || ret > 50) { + if (srcfd != -1) + close(srcfd); + close(destfd); + ERROR("Out of memory"); + return -EINVAL; + } + + ret = mount(mntsrc, destbuf, fstype, flags, data); + saved_errno = errno; + if (srcfd != -1) + close(srcfd); + close(destfd); + if (ret < 0) { + errno = saved_errno; + SYSERROR("Failed to mount %s onto %s", src, dest); + return ret; + } + + return 0; +} + /* * Mount a proc under @rootfs if proc self points to a pid other than * my own. This is needed to have a known-good proc mount for setting @@ -1446,7 +1679,7 @@ int mount_proc_if_needed(const char *rootfs) return 0; domount: - if (mount("proc", path, "proc", 0, NULL)) + if (safe_mount("proc", path, "proc", 0, NULL, rootfs) < 0) return -1; INFO("Mounted /proc in container for security transition"); return 1; diff --git a/src/lxc/utils.h b/src/lxc/utils.h index ee12dde45..059026f01 100644 --- a/src/lxc/utils.h +++ b/src/lxc/utils.h @@ -279,6 +279,8 @@ bool switch_to_ns(pid_t pid, const char *ns); int is_dir(const char *path); char *get_template_path(const char *t); int setproctitle(char *title); +int safe_mount(const char *src, const char *dest, const char *fstype, + unsigned long flags, const void *data, const char *rootfs); int mount_proc_if_needed(const char *rootfs); int null_stdfds(void); #endif /* __LXC_UTILS_H */ diff --git a/src/tests/Makefile.am b/src/tests/Makefile.am index 2355c845a..462d4f278 100644 --- a/src/tests/Makefile.am +++ b/src/tests/Makefile.am @@ -55,6 +55,7 @@ bin_SCRIPTS += \ lxc-test-apparmor-mount \ lxc-test-checkpoint-restore \ lxc-test-snapdeps \ + lxc-test-symlink \ lxc-test-ubuntu \ lxc-test-unpriv \ lxc-test-usernic @@ -82,6 +83,7 @@ EXTRA_DIST = \ lxc-test-cloneconfig \ lxc-test-createconfig \ lxc-test-snapdeps \ + lxc-test-symlink \ lxc-test-ubuntu \ lxc-test-unpriv \ may_control.c \ diff --git a/src/tests/lxc-test-symlink b/src/tests/lxc-test-symlink new file mode 100755 index 000000000..37320f0df --- /dev/null +++ b/src/tests/lxc-test-symlink @@ -0,0 +1,88 @@ +#!/bin/bash + +set -ex + +# lxc: linux Container library + +# Authors: +# Serge Hallyn +# +# This is a regression test for symbolic links + +dirname=`mktemp -d` +fname=`mktemp` +fname2=`mktemp` + +lxcpath=/var/lib/lxcsym1 + +cleanup() { + lxc-destroy -P $lxcpath -f -n symtest1 || true + rm -f $lxcpath + rmdir $dirname || true + rm -f $fname || true + rm -f $fname2 || true +} + +trap cleanup EXIT SIGHUP SIGINT SIGTERM + +testrun() { + expected=$1 + run=$2 + pass="pass" + lxc-start -P $lxcpath -n symtest1 -l trace -o $lxcpath/log || pass="fail" + [ $pass = "pass" ] && lxc-wait -P $lxcpath -n symtest1 -t 10 -s RUNNING || pass="fail" + if [ "$pass" != "$expected" ]; then + echo "Test $run: expected $expected but container did not. Start log:" + cat $lxcpath/log + echo "FAIL: Test $run: expected $expected but container did not." + false + fi + lxc-stop -P $lxcpath -n symtest1 -k || true +} + +# make lxcpath a symlink - this should NOT cause failure +ln -s /var/lib/lxc $lxcpath + +lxc-destroy -P $lxcpath -f -n symtest1 || true +lxc-create -P $lxcpath -t busybox -n symtest1 + +cat >> /var/lib/lxc/symtest1/config << EOF +lxc.mount.entry = $dirname opt/xxx/dir none bind,create=dir +lxc.mount.entry = $fname opt/xxx/file none bind,create=file +lxc.mount.entry = $fname2 opt/xxx/file2 none bind +EOF + +# Regular - should succeed +mkdir -p /var/lib/lxc/symtest1/rootfs/opt/xxx +touch /var/lib/lxc/symtest1/rootfs/opt/xxx/file2 +testrun pass 1 + +# symlink - should fail +rm -rf /var/lib/lxc/symtest1/rootfs/opt/xxx +mkdir -p /var/lib/lxc/symtest1/rootfs/opt/xxx2 +ln -s /var/lib/lxc/symtest1/rootfs/opt/xxx2 /var/lib/lxc/symtest1/rootfs/opt/xxx +touch /var/lib/lxc/symtest1/rootfs/opt/xxx/file2 +testrun fail 2 + +# final final symlink - should fail +rm -rf $lxcpath/symtest1/rootfs/opt/xxx +mkdir -p $lxcpath/symtest1/rootfs/opt/xxx +mkdir -p $lxcpath/symtest1/rootfs/opt/xxx/dir +touch $lxcpath/symtest1/rootfs/opt/xxx/file +touch $lxcpath/symtest1/rootfs/opt/xxx/file2src +ln -s $lxcpath/symtest1/rootfs/opt/xxx/file2src $lxcpath/symtest1/rootfs/opt/xxx/file2 +testrun fail 3 + +# Ideally we'd also try a loop device, but that won't work in nested containers +# anyway - TODO + +# what about /proc itself + +rm -rf $lxcpath/symtest1/rootfs/opt/xxx +mkdir -p $lxcpath/symtest1/rootfs/opt/xxx +touch $lxcpath/symtest1/rootfs/opt/xxx/file2 +mv $lxcpath/symtest1/rootfs/proc $lxcpath/symtest1/rootfs/proc1 +ln -s $lxcpath/symtest1/rootfs/proc1 $lxcpath/symtest1/rootfs/proc +testrun fail 4 + +echo "all tests passed"