mirror_lxc/src/lxc/conf.c
Serge Hallyn bbd23aa0bb warn about insufficient permissions
With this patch, if an unprivileged user has $HOME 700 or
750 and does

lxc-start -n c1

he'll see an error like:

lxc_container: Permission denied - could not access /home/serge.  Please grant it 'x' access, or add an ACL for the container root.

(This addresses bug pad.lv/1277466)

Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
Acked-by: Stéphane Graber <stgraber@ubuntu.com>
2014-02-10 21:20:08 -05:00

4138 lines
96 KiB
C

/*
* lxc: linux Container library
*
* (C) Copyright IBM Corp. 2007, 2008
*
* Authors:
* Daniel Lezcano <daniel.lezcano at free.fr>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "config.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <errno.h>
#include <string.h>
#include <dirent.h>
#include <unistd.h>
#include <inttypes.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <time.h>
#if HAVE_PTY_H
#include <pty.h>
#else
#include <../include/openpty.h>
#endif
#include <linux/loop.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/mount.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <net/if.h>
#include <libgen.h>
#include "network.h"
#include "error.h"
#include "parse.h"
#include "utils.h"
#include "conf.h"
#include "log.h"
#include "caps.h" /* for lxc_caps_last_cap() */
#include "bdev.h"
#include "cgroup.h"
#include "lxclock.h"
#include "namespace.h"
#include "lsm/lsm.h"
#if HAVE_SYS_CAPABILITY_H
#include <sys/capability.h>
#endif
#if HAVE_SYS_PERSONALITY_H
#include <sys/personality.h>
#endif
#if IS_BIONIC
#include <../include/lxcmntent.h>
#else
#include <mntent.h>
#endif
#include "lxcseccomp.h"
/* Set up logging for this file through the project's lxc_log_define()
 * macro (arguments: lxc_conf, lxc). */
lxc_log_define(lxc_conf, lxc);
/* Length limits. They are not referenced in the code near here;
 * NOTE(review): presumably used by the network configuration code
 * further down the file -- confirm. */
#define MAXHWLEN 18
#define MAXINDEXLEN 20
#define MAXMTULEN 16
#define MAXLINELEN 128
/* Fallback numbers for capabilities that the installed
 * <sys/capability.h> may not define. */
#if HAVE_SYS_CAPABILITY_H
#ifndef CAP_SETFCAP
#define CAP_SETFCAP 31
#endif
#ifndef CAP_MAC_OVERRIDE
#define CAP_MAC_OVERRIDE 32
#endif
#ifndef CAP_MAC_ADMIN
#define CAP_MAC_ADMIN 33
#endif
#endif
/* Fallback value used when the system headers do not provide
 * PR_CAPBSET_DROP. */
#ifndef PR_CAPBSET_DROP
#define PR_CAPBSET_DROP 24
#endif
/* Fallback value for the loop-device flag that setup_lodev() stores in
 * lo_flags. */
#ifndef LO_FLAGS_AUTOCLEAR
#define LO_FLAGS_AUTOCLEAR 4
#endif
/*
 * pivot_root() shim.
 *
 * When the build found pivot_root() in the C library (HAVE_PIVOT_ROOT)
 * we only declare it.  Otherwise a local wrapper issues the raw system
 * call, or fails with ENOSYS when the syscall number is unknown.
 */
#ifdef HAVE_PIVOT_ROOT
extern int pivot_root(const char *new_root, const char *put_old);
#else
static int pivot_root(const char *new_root, const char *put_old)
{
#ifndef __NR_pivot_root
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_pivot_root, new_root, put_old);
#endif
}
#endif
/*
 * sethostname() shim for C libraries that lack it: issue the raw system
 * call when its number is known, otherwise fail with ENOSYS.
 */
#ifndef HAVE_SETHOSTNAME
static int sethostname(const char *name, size_t len)
{
#ifndef __NR_sethostname
	errno = ENOSYS;
	return -1;
#else
	return syscall(__NR_sethostname, name, len);
#endif
}
#endif
/* Define __S_ISTYPE if missing from the C library: true when the file
 * type bits of 'mode' equal 'mask'. */
#ifndef __S_ISTYPE
#define __S_ISTYPE(mode, mask) (((mode) & S_IFMT) == (mask))
#endif
/* Hook names, stored in an array of NUM_LXC_HOOKS entries. */
char *lxchook_names[NUM_LXC_HOOKS] = {
"pre-start", "pre-mount", "mount", "autodev", "start", "post-stop", "clone" };
/* Callback signature shared by the per-network-type setup and shutdown
 * tables (netdev_conf and netdev_deconf). */
typedef int (*instanciate_cb)(struct lxc_handler *, struct lxc_netdev *);
/* One mount option keyword and the MS_* flag it maps to.
 * NOTE(review): 'clear' presumably means the flag is cleared rather
 * than set -- confirm against the option parser. */
struct mount_opt {
char *name;
int clear;
int flag;
};
/* One capability name and its numeric CAP_* value. */
struct caps_opt {
char *name;
int value;
};
/* Per-network-type setup callbacks; defined later in this file. */
static int instanciate_veth(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_vlan(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_phys(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_empty(struct lxc_handler *, struct lxc_netdev *);
static int instanciate_none(struct lxc_handler *, struct lxc_netdev *);
/* Dispatch table indexed by LXC_NET_* type; slots without an explicit
 * initializer are NULL. */
static instanciate_cb netdev_conf[LXC_NET_MAXCONFTYPE + 1] = {
[LXC_NET_VETH] = instanciate_veth,
[LXC_NET_MACVLAN] = instanciate_macvlan,
[LXC_NET_VLAN] = instanciate_vlan,
[LXC_NET_PHYS] = instanciate_phys,
[LXC_NET_EMPTY] = instanciate_empty,
[LXC_NET_NONE] = instanciate_none,
};
/* Per-network-type shutdown callbacks; defined later in this file. */
static int shutdown_veth(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_macvlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_vlan(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_phys(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_empty(struct lxc_handler *, struct lxc_netdev *);
static int shutdown_none(struct lxc_handler *, struct lxc_netdev *);
/* Dispatch table indexed by LXC_NET_* type, mirroring netdev_conf;
 * slots without an explicit initializer are NULL. */
static instanciate_cb netdev_deconf[LXC_NET_MAXCONFTYPE + 1] = {
[LXC_NET_VETH] = shutdown_veth,
[LXC_NET_MACVLAN] = shutdown_macvlan,
[LXC_NET_VLAN] = shutdown_vlan,
[LXC_NET_PHYS] = shutdown_phys,
[LXC_NET_EMPTY] = shutdown_empty,
[LXC_NET_NONE] = shutdown_none,
};
/*
 * Known mount option keywords.  Columns are { name, clear, flag }; the
 * list ends with a NULL-name sentinel.
 * NOTE(review): entries with clear == 1 ("rw", "suid", "dev", ...) are
 * presumably the ones that remove 'flag' from the mount flags, while
 * clear == 0 adds it -- confirm against the option parser.
 */
static struct mount_opt mount_opt[] = {
{ "defaults", 0, 0 },
{ "ro", 0, MS_RDONLY },
{ "rw", 1, MS_RDONLY },
{ "suid", 1, MS_NOSUID },
{ "nosuid", 0, MS_NOSUID },
{ "dev", 1, MS_NODEV },
{ "nodev", 0, MS_NODEV },
{ "exec", 1, MS_NOEXEC },
{ "noexec", 0, MS_NOEXEC },
{ "sync", 0, MS_SYNCHRONOUS },
{ "async", 1, MS_SYNCHRONOUS },
{ "dirsync", 0, MS_DIRSYNC },
{ "remount", 0, MS_REMOUNT },
{ "mand", 0, MS_MANDLOCK },
{ "nomand", 1, MS_MANDLOCK },
{ "atime", 1, MS_NOATIME },
{ "noatime", 0, MS_NOATIME },
{ "diratime", 1, MS_NODIRATIME },
{ "nodiratime", 0, MS_NODIRATIME },
{ "bind", 0, MS_BIND },
{ "rbind", 0, MS_BIND|MS_REC },
{ "relatime", 0, MS_RELATIME },
{ "norelatime", 1, MS_RELATIME },
{ "strictatime", 0, MS_STRICTATIME },
{ "nostrictatime", 1, MS_STRICTATIME },
{ NULL, 0, 0 },
};
/*
 * Capability names mapped to their CAP_* numbers.  Entries guarded by
 * #ifdef are only present when the headers define that capability.
 * Without <sys/capability.h> the table is empty.
 */
#if HAVE_SYS_CAPABILITY_H
static struct caps_opt caps_opt[] = {
{ "chown", CAP_CHOWN },
{ "dac_override", CAP_DAC_OVERRIDE },
{ "dac_read_search", CAP_DAC_READ_SEARCH },
{ "fowner", CAP_FOWNER },
{ "fsetid", CAP_FSETID },
{ "kill", CAP_KILL },
{ "setgid", CAP_SETGID },
{ "setuid", CAP_SETUID },
{ "setpcap", CAP_SETPCAP },
{ "linux_immutable", CAP_LINUX_IMMUTABLE },
{ "net_bind_service", CAP_NET_BIND_SERVICE },
{ "net_broadcast", CAP_NET_BROADCAST },
{ "net_admin", CAP_NET_ADMIN },
{ "net_raw", CAP_NET_RAW },
{ "ipc_lock", CAP_IPC_LOCK },
{ "ipc_owner", CAP_IPC_OWNER },
{ "sys_module", CAP_SYS_MODULE },
{ "sys_rawio", CAP_SYS_RAWIO },
{ "sys_chroot", CAP_SYS_CHROOT },
{ "sys_ptrace", CAP_SYS_PTRACE },
{ "sys_pacct", CAP_SYS_PACCT },
{ "sys_admin", CAP_SYS_ADMIN },
{ "sys_boot", CAP_SYS_BOOT },
{ "sys_nice", CAP_SYS_NICE },
{ "sys_resource", CAP_SYS_RESOURCE },
{ "sys_time", CAP_SYS_TIME },
{ "sys_tty_config", CAP_SYS_TTY_CONFIG },
{ "mknod", CAP_MKNOD },
{ "lease", CAP_LEASE },
#ifdef CAP_AUDIT_WRITE
{ "audit_write", CAP_AUDIT_WRITE },
#endif
#ifdef CAP_AUDIT_CONTROL
{ "audit_control", CAP_AUDIT_CONTROL },
#endif
{ "setfcap", CAP_SETFCAP },
{ "mac_override", CAP_MAC_OVERRIDE },
{ "mac_admin", CAP_MAC_ADMIN },
#ifdef CAP_SYSLOG
{ "syslog", CAP_SYSLOG },
#endif
#ifdef CAP_WAKE_ALARM
{ "wake_alarm", CAP_WAKE_ALARM },
#endif
};
#else
static struct caps_opt caps_opt[] = {};
#endif
/*
 * Run the command line in 'buffer' through lxc_popen(), copy each line
 * of its output to the debug log and translate its exit status.
 *
 * Returns 0 when the command exited with status 0, -1 when it could not
 * be started, exited non-zero, or was killed by a signal.
 */
static int run_buffer(char *buffer)
{
	struct lxc_popen_FILE *proc;
	char *line;
	int status;

	proc = lxc_popen(buffer);
	if (proc == NULL) {
		SYSERROR("popen failed");
		return -1;
	}

	line = malloc(LXC_LOG_BUFFER_SIZE);
	if (line == NULL) {
		ERROR("failed to allocate memory for script output");
		lxc_pclose(proc);
		return -1;
	}

	/* Relay the script's output line by line. */
	for (;;) {
		if (!fgets(line, LXC_LOG_BUFFER_SIZE, proc->f))
			break;
		DEBUG("script output: %s", line);
	}
	free(line);

	status = lxc_pclose(proc);
	if (status == -1) {
		SYSERROR("Script exited on error");
		return -1;
	}

	if (WIFEXITED(status) && WEXITSTATUS(status) != 0) {
		ERROR("Script exited with status %d", WEXITSTATUS(status));
		return -1;
	}

	if (WIFSIGNALED(status)) {
		ERROR("Script terminated by signal %d (%s)", WTERMSIG(status),
		      strsignal(WTERMSIG(status)));
		return -1;
	}

	return 0;
}
/*
 * Build the command line "<script> <name> <section> <hook> [args...]"
 * and run it with run_buffer().
 *
 * name, section, script, hook: the four leading words of the command.
 * lxcpath: accepted for interface compatibility; not used here.
 * argsin:  optional NULL-terminated array of extra arguments (may be
 *          NULL).
 *
 * The command buffer lives on the heap: its size depends on the caller's
 * strings, so a stack allocation could overflow the stack.
 *
 * Returns run_buffer()'s result, or -1 if the command could not be
 * built.
 */
static int run_script_argv(const char *name, const char *section,
			   const char *script, const char *hook, const char *lxcpath,
			   char **argsin)
{
	int ret, i;
	int rc = -1;
	char *buffer;
	size_t size = 0;

	(void)lxcpath;

	INFO("Executing script '%s' for container '%s', config section '%s'",
	     script, name, section);

	/* each extra argument is preceded by one space */
	for (i = 0; argsin && argsin[i]; i++)
		size += strlen(argsin[i]) + 1;

	size += strlen(hook) + 1;
	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3; /* two more separators and the terminating NUL */

	/* snprintf() reports lengths as int */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("failed to allocate memory");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s %s", script, name, section, hook);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long");
		goto out;
	}

	for (i = 0; argsin && argsin[i]; i++) {
		size_t len = size - (size_t)ret;
		int n;

		n = snprintf(buffer + ret, len, " %s", argsin[i]);
		if (n < 0 || (size_t)n >= len) {
			ERROR("Script args too long");
			goto out;
		}
		ret += n;
	}

	rc = run_buffer(buffer);
out:
	free(buffer);
	return rc;
}
/*
 * Build the command line "<script> <name> <section> [args...]" and run
 * it with run_buffer().
 *
 * The variadic arguments are C strings and MUST end with a NULL
 * pointer; each one is appended to the command, separated by a space.
 *
 * The command buffer lives on the heap because its size depends on the
 * caller's strings, and every va_start() is paired with a va_end() on
 * all paths, including the error exits.
 *
 * Returns run_buffer()'s result, or -1 if the command could not be
 * built.
 */
static int run_script(const char *name, const char *section,
		      const char *script, ...)
{
	int ret;
	int rc = -1;
	char *buffer, *p;
	size_t size = 0;
	va_list ap;

	INFO("Executing script '%s' for container '%s', config section '%s'",
	     script, name, section);

	/* first pass: measure the extra arguments */
	va_start(ap, script);
	while ((p = va_arg(ap, char *)))
		size += strlen(p) + 1;
	va_end(ap);

	size += strlen(script);
	size += strlen(name);
	size += strlen(section);
	size += 3; /* two separators and the terminating NUL */

	/* snprintf() reports lengths as int */
	if (size > INT_MAX)
		return -1;

	buffer = malloc(size);
	if (!buffer) {
		ERROR("failed to allocate memory");
		return -1;
	}

	ret = snprintf(buffer, size, "%s %s %s", script, name, section);
	if (ret < 0 || (size_t)ret >= size) {
		ERROR("Script name too long");
		goto out;
	}

	/* second pass: append the extra arguments */
	va_start(ap, script);
	while ((p = va_arg(ap, char *))) {
		size_t len = size - (size_t)ret;
		int n;

		n = snprintf(buffer + ret, len, " %s", p);
		if (n < 0 || (size_t)n >= len) {
			ERROR("Script args too long");
			va_end(ap);
			goto out;
		}
		ret += n;
	}
	va_end(ap);

	rc = run_buffer(buffer);
out:
	free(buffer);
	return rc;
}
/*
 * Per-line callback for mount_unknown_fs(): treat the line in 'buffer'
 * as a filesystem type name and try to mount the rootfs with it.
 *
 * buffer: one line of a filesystems list; it is trimmed in place.
 * data:   points to a struct holding rootfs, target and options.
 *
 * Returns 1 when the mount succeeded, 0 when this line should be
 * skipped (a "nodev" entry, or the mount failed), and -1 when the mount
 * options could not be parsed.
 */
static int find_fstype_cb(char* buffer, void *data)
{
	struct cbarg {
		const char *rootfs;
		const char *target;
		const char *options;
	} *cbarg = data;

	unsigned long mntflags;
	/* start from NULL so the free() on the error path below is always
	 * well-defined, whatever parse_mntopts() stored before failing */
	char *mntdata = NULL;
	char *fstype;

	/* we don't try 'nodev' entries */
	if (strstr(buffer, "nodev"))
		return 0;

	/* strip surrounding whitespace from the type name */
	fstype = buffer;
	fstype += lxc_char_left_gc(fstype, strlen(fstype));
	fstype[lxc_char_right_gc(fstype, strlen(fstype))] = '\0';

	DEBUG("trying to mount '%s'->'%s' with fstype '%s'",
	      cbarg->rootfs, cbarg->target, fstype);

	if (parse_mntopts(cbarg->options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	if (mount(cbarg->rootfs, cbarg->target, fstype, mntflags, mntdata)) {
		DEBUG("mount failed with error: %s", strerror(errno));
		free(mntdata);
		return 0;
	}

	free(mntdata);

	INFO("mounted '%s' on '%s', with fstype '%s'",
	     cbarg->rootfs, cbarg->target, fstype);

	return 1;
}
static int mount_unknown_fs(const char *rootfs, const char *target,
const char *options)
{
int i;
struct cbarg {
const char *rootfs;
const char *target;
const char *options;
} cbarg = {
.rootfs = rootfs,
.target = target,
.options = options,
};
/*
* find the filesystem type with brute force:
* first we check with /etc/filesystems, in case the modules
* are auto-loaded and fall back to the supported kernel fs
*/
char *fsfile[] = {
"/etc/filesystems",
"/proc/filesystems",
};
for (i = 0; i < sizeof(fsfile)/sizeof(fsfile[0]); i++) {
int ret;
if (access(fsfile[i], F_OK))
continue;
ret = lxc_file_for_each_line(fsfile[i], find_fstype_cb, &cbarg);
if (ret < 0) {
ERROR("failed to parse '%s'", fsfile[i]);
return -1;
}
if (ret)
return 0;
}
ERROR("failed to determine fs type for '%s'", rootfs);
return -1;
}
/*
 * Mount a directory rootfs by recursively bind-mounting it on 'target',
 * applying the flags and data parsed from 'options'.
 *
 * Returns -1 if the options cannot be parsed, otherwise the result of
 * mount(2).
 */
static int mount_rootfs_dir(const char *rootfs, const char *target,
			    const char *options)
{
	unsigned long mntflags;
	/* start from NULL so the free() on the error path is always
	 * well-defined, whatever parse_mntopts() stored before failing */
	char *mntdata = NULL;
	int ret;

	if (parse_mntopts(options, &mntflags, &mntdata) < 0) {
		free(mntdata);
		return -1;
	}

	ret = mount(rootfs, target, "none", MS_BIND | MS_REC | mntflags, mntdata);
	free(mntdata);

	return ret;
}
/*
 * Attach the image file 'rootfs' to the loop device open on 'fd'.
 *
 * rootfs: path of the backing file.
 * fd:     open descriptor of a free loop device.
 * loinfo: caller-provided status block; it is zeroed, given the
 *         LO_FLAGS_AUTOCLEAR flag and applied to the device.
 *
 * If applying the status fails after the file was attached, the device
 * is detached again so it is not left bound to the file.
 *
 * Returns 0 on success, -1 on error.
 */
static int setup_lodev(const char *rootfs, int fd, struct loop_info64 *loinfo)
{
	int rfd;
	int ret = -1;

	rfd = open(rootfs, O_RDWR);
	if (rfd < 0) {
		SYSERROR("failed to open '%s'", rootfs);
		return -1;
	}

	memset(loinfo, 0, sizeof(*loinfo));

	loinfo->lo_flags = LO_FLAGS_AUTOCLEAR;

	if (ioctl(fd, LOOP_SET_FD, rfd)) {
		SYSERROR("failed to LOOP_SET_FD");
		goto out;
	}

	if (ioctl(fd, LOOP_SET_STATUS64, loinfo)) {
		SYSERROR("failed to LOOP_SET_STATUS64");
		/* autoclear was not applied, so release the device by hand;
		 * best effort, the original failure is what gets reported */
		(void)ioctl(fd, LOOP_CLR_FD, 0);
		goto out;
	}

	ret = 0;
out:
	close(rfd);

	return ret;
}
/*
 * Mount a regular-file rootfs image: scan /dev for the first free loop
 * device, attach the image to it and mount that device on 'target'.
 *
 * Only the first free loop device is tried.  Returns the result of the
 * attach/mount attempt, or -1 if /dev cannot be opened or no free
 * device was found.
 */
static int mount_rootfs_file(const char *rootfs, const char *target,
			     const char *options)
{
	struct dirent entry, *found;
	struct loop_info64 loinfo;
	char devpath[MAXPATHLEN];
	int result = -1;
	DIR *devdir;

	devdir = opendir("/dev");
	if (devdir == NULL) {
		SYSERROR("failed to open '/dev'");
		return -1;
	}

	for (;;) {
		int loopfd, n;

		/* stop on a read error or at the end of the directory */
		if (readdir_r(devdir, &entry, &found) != 0 || found == NULL)
			break;

		if (strcmp(found->d_name, ".") == 0 ||
		    strcmp(found->d_name, "..") == 0 ||
		    strncmp(found->d_name, "loop", 4) != 0)
			continue;

		n = snprintf(devpath, MAXPATHLEN, "/dev/%s", found->d_name);
		if (n < 0 || n >= MAXPATHLEN)
			continue;

		loopfd = open(devpath, O_RDWR);
		if (loopfd < 0)
			continue;

		/* a successful status query means the device is in use */
		if (ioctl(loopfd, LOOP_GET_STATUS64, &loinfo) == 0) {
			close(loopfd);
			continue;
		}

		/* only ENXIO is accepted as "this device is free" */
		if (errno != ENXIO) {
			WARN("unexpected error for ioctl on '%s': %m",
			     found->d_name);
			close(loopfd);
			continue;
		}

		DEBUG("found '%s' free lodev", devpath);

		result = setup_lodev(rootfs, loopfd, &loinfo);
		if (result == 0)
			result = mount_unknown_fs(devpath, target, options);
		close(loopfd);

		break;
	}

	if (closedir(devdir) != 0)
		WARN("failed to close directory");

	return result;
}
/*
 * Mount a block-device rootfs.  The filesystem type is not known in
 * advance, so the work is handed to mount_unknown_fs() and its result
 * is returned unchanged.
 */
static int mount_rootfs_block(const char *rootfs, const char *target,
			      const char *options)
{
	int rc = mount_unknown_fs(rootfs, target, options);

	return rc;
}
/*
 * pin_rootfs
 * If rootfs is a directory, open ${rootfs}/lxc.hold for writing and keep
 * it open for the duration of the container run, so that the container
 * cannot mark the underlying filesystem read-only on shutdown.  The file
 * is unlinked right after it is opened, so no name is left behind.
 *
 * Returns -1 on error,
 *         -2 if nothing needed to be pinned,
 *         an open fd (>= 0) if the rootfs was pinned.
 */
int pin_rootfs(const char *rootfs)
{
	char resolved[MAXPATHLEN];
	char holdfile[MAXPATHLEN];
	struct stat st;
	int len, holdfd;

	/* no rootfs configured: nothing to pin */
	if (!rootfs || rootfs[0] == '\0')
		return -2;

	if (realpath(rootfs, resolved) == NULL)
		return -2;

	if (access(resolved, F_OK) != 0 || stat(resolved, &st) != 0)
		return -1;

	/* only directory-backed root filesystems are pinned */
	if (!S_ISDIR(st.st_mode))
		return -2;

	len = snprintf(holdfile, MAXPATHLEN, "%s/lxc.hold", resolved);
	if (len >= MAXPATHLEN)
		return -1;

	holdfd = open(holdfile, O_CREAT | O_RDWR, S_IWUSR|S_IRUSR);
	if (holdfd >= 0)
		(void)unlink(holdfile);

	return holdfd;
}
/*
 * Perform the automatic mounts selected by 'flags'.
 *
 * Every table row whose (flags & match_mask) equals match_flag is
 * mounted, with "%r" in its source and destination replaced by the
 * container rootfs mount point.  If any LXC_AUTO_CGROUP_MASK bit is
 * set, cgroup_mount() is called as well.
 *
 * Returns 0 on success, -1 on the first failure; errno from a failed
 * mount is preserved across the cleanup.
 */
static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_handler *handler)
{
	static struct auto_mount {
		int match_mask;
		int match_flag;
		const char *source;
		const char *destination;
		const char *fstype;
		unsigned long flags;
		const char *options;
	} default_mounts[] = {
		/* About the read-only bind mounts below: older kernels needed
		 * an MS_BIND mount followed by MS_REMOUNT|MS_RDONLY on the
		 * same spot.  The mount(2) manpage says MS_BIND honors
		 * MS_RDONLY from kernel 2.6.26 onwards, but that apparently
		 * does not work on kernel 3.8.  On that same kernel the old
		 * two-step trick does not work either unless MS_BIND is ALSO
		 * given for the remount; without it the whole fs is remounted
		 * read-only or the mount fails because it is busy.
		 * MS_REMOUNT|MS_BIND|MS_RDONLY seems to work for kernels as
		 * low as 2.6.32.
		 */
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sys", "%r/proc/sys", NULL, MS_BIND, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sys", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, "%r/proc/sysrq-trigger", "%r/proc/sysrq-trigger", NULL, MS_BIND, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED, NULL, "%r/proc/sysrq-trigger", NULL, MS_REMOUNT|MS_BIND|MS_RDONLY, NULL },
		{ LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW, "proc", "%r/proc", "proc", MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW, "sysfs", "%r/sys", "sysfs", 0, NULL },
		{ LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO, "sysfs", "%r/sys", "sysfs", MS_RDONLY, NULL },
		{ 0, 0, NULL, NULL, NULL, 0, NULL }
	};
	const struct auto_mount *m;

	/* the table ends at the row whose match_mask is 0 */
	for (m = default_mounts; m->match_mask; m++) {
		char *src = NULL;
		char *dst = NULL;
		int rc, saved_errno;

		if ((flags & m->match_mask) != m->match_flag)
			continue;

		if (m->source) {
			/* will act like strdup if %r is not present */
			src = lxc_string_replace("%r", conf->rootfs.mount, m->source);
			if (!src) {
				SYSERROR("memory allocation error");
				return -1;
			}
		}

		if (m->destination) {
			/* will act like strdup if %r is not present */
			dst = lxc_string_replace("%r", conf->rootfs.mount, m->destination);
			if (!dst) {
				saved_errno = errno;
				SYSERROR("memory allocation error");
				free(src);
				errno = saved_errno;
				return -1;
			}
		}

		rc = mount(src, dst, m->fstype, m->flags, m->options);
		/* remember errno before logging and freeing can change it */
		saved_errno = errno;
		if (rc < 0)
			SYSERROR("error mounting %s on %s", src, dst);
		free(src);
		free(dst);
		if (rc < 0) {
			errno = saved_errno;
			return -1;
		}
	}

	if ((flags & LXC_AUTO_CGROUP_MASK) != 0 &&
	    !cgroup_mount(conf->rootfs.mount, handler,
			  flags & LXC_AUTO_CGROUP_MASK)) {
		SYSERROR("error mounting /sys/fs/cgroup");
		return -1;
	}

	return 0;
}
/*
 * Walk 'path' from the top and report the first parent directory that
 * cannot be searched (no 'x' access).  The final path component itself
 * is not checked.  Nothing is printed when every parent is accessible.
 */
static void print_top_failing_dir(const char *path)
{
	size_t pathlen = strlen(path);
	char *work = alloca(pathlen + 1);
	char *cursor = work;
	char *end = work + pathlen;
	char sep;

	memcpy(work, path, pathlen + 1);

	for (;;) {
		if (cursor >= end)
			return;

		/* skip the separators, then the component that follows */
		while (cursor < end && *cursor == '/')
			cursor++;
		while (cursor < end && *cursor != '/')
			cursor++;

		/* reached the last component: nothing left to check */
		if (cursor >= end)
			return;

		/* temporarily cut the string after this component */
		sep = *cursor;
		*cursor = '\0';
		if (access(work, X_OK) != 0) {
			SYSERROR("could not access %s. Please grant it 'x' "
				 "access, or add an ACL for the container root.",
				 work);
			return;
		}
		*cursor = sep;
	}
}
/*
 * Mount 'rootfs' on 'target', choosing the method by file type: a
 * directory is bind-mounted, a block device is mounted directly and a
 * regular file goes through a loop device.
 *
 * Returns the chosen helper's result, or -1 if the path cannot be
 * resolved, accessed or stat'ed, or has an unsupported type.
 */
static int mount_rootfs(const char *rootfs, const char *target, const char *options)
{
	char resolved[MAXPATHLEN];
	struct stat st;

	if (realpath(rootfs, resolved) == NULL) {
		SYSERROR("failed to get real path for '%s'", rootfs);
		return -1;
	}

	if (access(resolved, F_OK) != 0) {
		SYSERROR("'%s' is not accessible", resolved);
		return -1;
	}

	if (stat(resolved, &st) != 0) {
		SYSERROR("failed to stat '%s'", resolved);
		return -1;
	}

	switch (st.st_mode & S_IFMT) {
	case S_IFDIR:
		return mount_rootfs_dir(resolved, target, options);
	case S_IFBLK:
		return mount_rootfs_block(resolved, target, options);
	case S_IFREG:
		return mount_rootfs_file(resolved, target, options);
	default:
		break;
	}

	ERROR("unsupported rootfs type for '%s'", resolved);
	return -1;
}
/*
 * Set the hostname to utsname->nodename.
 *
 * A NULL 'utsname' means no hostname was configured and is not an
 * error.  Returns 0 on success or when there is nothing to do, -1 if
 * sethostname() fails.
 */
static int setup_utsname(struct utsname *utsname)
{
	const char *host;

	if (utsname == NULL)
		return 0;

	host = utsname->nodename;
	if (sethostname(host, strlen(host)) != 0) {
		SYSERROR("failed to set the hostname to '%s'", host);
		return -1;
	}

	INFO("'%s' hostname has been setup", host);

	return 0;
}
/*
 * Bind-mount the allocated ptys onto the container's /dev/ttyN nodes.
 *
 * rootfs:   container root; nothing is done when rootfs->path is unset.
 * tty_info: the ptys to expose (nbtty entries).
 * ttydir:   optional subdirectory of /dev.  When set, each pty is
 *           mounted on /dev/<ttydir>/ttyN and /dev/ttyN becomes a
 *           symlink to it; otherwise the pty is mounted directly on
 *           /dev/ttyN.
 *
 * A failed bind mount is only logged and that tty is skipped.  Returns
 * 0 on success, -1 when a path is too long or a file or symlink cannot
 * be set up.
 */
static int setup_tty(const struct lxc_rootfs *rootfs,
		     const struct lxc_tty_info *tty_info, char *ttydir)
{
	char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
	int i, ret;

	if (!rootfs->path)
		return 0;

	for (i = 0; i < tty_info->nbtty; i++) {

		struct lxc_pty_info *pty_info = &tty_info->pty_info[i];

		ret = snprintf(path, sizeof(path), "%s/dev/tty%d",
			       rootfs->mount, i + 1);
		if (ret < 0 || (size_t)ret >= sizeof(path)) {
			ERROR("pathname too long for ttys");
			return -1;
		}
		if (ttydir) {
			/* create dev/lxc/tty%d" */
			ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/tty%d",
				       rootfs->mount, ttydir, i + 1);
			if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
				ERROR("pathname too long for ttys");
				return -1;
			}
			/* the mount target must exist; an existing file is fine */
			ret = creat(lxcpath, 0660);
			if (ret==-1 && errno != EEXIST) {
				SYSERROR("error creating %s", lxcpath);
				return -1;
			}
			if (ret >= 0)
				close(ret);
			/* make room for the symlink created below */
			ret = unlink(path);
			if (ret && errno != ENOENT) {
				SYSERROR("error unlinking %s", path);
				return -1;
			}

			if (mount(pty_info->name, lxcpath, "none", MS_BIND, 0)) {
				/* report the path that was actually the mount target */
				WARN("failed to mount '%s'->'%s'",
				     pty_info->name, lxcpath);
				continue;
			}

			/* symlink target is relative to the container's /dev */
			ret = snprintf(lxcpath, sizeof(lxcpath), "%s/tty%d", ttydir, i+1);
			if (ret < 0 || (size_t)ret >= sizeof(lxcpath)) {
				ERROR("tty pathname too long");
				return -1;
			}
			ret = symlink(lxcpath, path);
			if (ret) {
				SYSERROR("failed to create symlink for tty %d", i+1);
				return -1;
			}
		} else {
			/* If we populated /dev, then we need to create /dev/ttyN */
			if (access(path, F_OK)) {
				ret = creat(path, 0660);
				if (ret==-1) {
					SYSERROR("error creating %s", path);
					/* this isn't fatal, continue */
				} else {
					close(ret);
				}
			}
			if (mount(pty_info->name, path, "none", MS_BIND, 0)) {
				WARN("failed to mount '%s'->'%s'",
				     pty_info->name, path);
				continue;
			}
		}
	}

	INFO("%d tty(s) has been setup", tty_info->nbtty);

	return 0;
}
/*
 * Per-line callback used while reading the old root's proc/mounts.
 *
 * buffer: one mount-table line; it is tokenised in place.
 * data:   array of two pointers: [0] the list collecting mount points,
 *         [1] the path prefix of the old root.
 *
 * Mount points starting with that prefix are appended to the list once
 * each.  Returns 0 when the line was handled (added or ignored), -1 on
 * a malformed line or allocation failure.
 */
static int setup_rootfs_pivot_root_cb(char *buffer, void *data)
{
	void **args = (void **)data;
	struct lxc_list *mounts = args[0];
	char *prefix = args[1];
	struct lxc_list *cur, *node;
	char *state = NULL;
	char *field;

	/* the first field is the mount name; only its presence matters */
	field = strtok_r(buffer, " ", &state);
	if (field == NULL)
		return -1;

	/* the second field is the mount point */
	field = strtok_r(NULL, " ", &state);
	if (field == NULL)
		return -1;

	/* only mount points below the old root are of interest */
	if (strncmp(field, prefix, strlen(prefix)) != 0)
		return 0;

	/* a mount point already in the list is not added again */
	lxc_list_for_each(cur, mounts) {
		if (strcmp(cur->elem, field) == 0)
			return 0;
	}

	node = malloc(sizeof(*node));
	if (node == NULL) {
		SYSERROR("malloc for mountpoint listentry failed");
		return -1;
	}

	node->elem = strdup(field);
	if (node->elem == NULL) {
		SYSERROR("strdup failed");
		free(node);
		return -1;
	}

	lxc_list_add_tail(mounts, node);

	return 0;
}
/*
 * Unmount everything still mounted below the old root after
 * pivot_root().
 *
 * oldrootfs: name of the directory the old root was moved to, relative
 *            to the new root.
 *
 * The mount points are read from <oldrootfs>/proc/mounts.  Normal
 * unmounts are repeated until the list stops shrinking; what is left is
 * detached lazily or, failing that, forcibly.  The pivot-dir prefix is
 * kept in a stack buffer and every collected list node is freed before
 * returning, on success and on error.
 *
 * Returns 0 on success (individual unmount failures are only logged),
 * -1 if a path is too long or the mount list cannot be read.
 */
static int umount_oldrootfs(const char *oldrootfs)
{
	char path[MAXPATHLEN];
	char pivotprefix[MAXPATHLEN];
	void *cbparm[2];
	struct lxc_list mountlist, *iterator, *next;
	int ok, still_mounted, last_still_mounted;
	int rc;
	int ret = -1;

	/* read and parse /proc/mounts in old root fs */
	lxc_list_init(&mountlist);

	/* oldrootfs is on the top tree directory now */
	rc = snprintf(pivotprefix, sizeof(pivotprefix), "/%s", oldrootfs);
	if (rc < 0 || (size_t)rc >= sizeof(pivotprefix)) {
		ERROR("rootfs name too long");
		return -1;
	}
	cbparm[0] = &mountlist;
	cbparm[1] = pivotprefix;

	rc = snprintf(path, sizeof(path), "%s/proc/mounts", oldrootfs);
	if (rc < 0 || (size_t)rc >= sizeof(path)) {
		ERROR("container proc/mounts name too long");
		return -1;
	}

	ok = lxc_file_for_each_line(path,
				    setup_rootfs_pivot_root_cb, cbparm);
	if (ok < 0) {
		SYSERROR("failed to read or parse mount list '%s'", path);
		goto out;
	}

	/* umount filesystems until none left or list no longer shrinks */
	still_mounted = 0;
	do {
		last_still_mounted = still_mounted;
		still_mounted = 0;

		lxc_list_for_each_safe(iterator, &mountlist, next) {

			/* umount normally */
			if (!umount(iterator->elem)) {
				DEBUG("umounted '%s'", (char *)iterator->elem);
				lxc_list_del(iterator);
				free(iterator->elem);
				free(iterator);
				continue;
			}

			still_mounted++;
		}

	} while (still_mounted > 0 && still_mounted != last_still_mounted);


	lxc_list_for_each(iterator, &mountlist) {

		/* let's try a lazy umount */
		if (!umount2(iterator->elem, MNT_DETACH)) {
			INFO("lazy unmount of '%s'", (char *)iterator->elem);
			continue;
		}

		/* be more brutal (nfs) */
		if (!umount2(iterator->elem, MNT_FORCE)) {
			INFO("forced unmount of '%s'", (char *)iterator->elem);
			continue;
		}

		WARN("failed to unmount '%s'", (char *)iterator->elem);
	}

	ret = 0;
out:
	/* release whatever is still on the list (also after a partial parse) */
	lxc_list_for_each_safe(iterator, &mountlist, next) {
		lxc_list_del(iterator);
		free(iterator->elem);
		free(iterator);
	}

	return ret;
}
/*
 * Make 'rootfs' the new root with pivot_root(2), park the old root on
 * <rootfs>/<pivotdir> and then unmount it.
 *
 * pivotdir: directory, relative to rootfs, that receives the old root;
 *           "lxc_putold" is used when NULL.  If it had to be created
 *           here, it is removed again afterwards.
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_rootfs_pivot_root(const char *rootfs, const char *pivotdir)
{
	char putold[MAXPATHLEN];
	int created_here = 0;
	int len;

	/* change into new root fs */
	if (chdir(rootfs) != 0) {
		SYSERROR("can't chdir to new rootfs '%s'", rootfs);
		return -1;
	}

	if (pivotdir == NULL)
		pivotdir = "lxc_putold";

	/* compute the full path to pivotdir under rootfs */
	len = snprintf(putold, sizeof(putold), "%s/%s", rootfs, pivotdir);
	if (len >= sizeof(putold)) {
		ERROR("pivot dir name too long");
		return -1;
	}

	if (access(putold, F_OK) != 0) {
		if (mkdir_p(putold, 0755) != 0) {
			SYSERROR("failed to create pivotdir '%s'", putold);
			return -1;
		}

		created_here = 1;
		DEBUG("created '%s' directory", putold);
	}

	DEBUG("mountpoint for old rootfs is '%s'", putold);

	/* pivot_root into our new root fs */
	if (pivot_root(".", putold) != 0) {
		SYSERROR("pivot_root syscall failed");
		return -1;
	}

	if (chdir("/") != 0) {
		SYSERROR("can't chdir to / after pivot_root");
		return -1;
	}

	DEBUG("pivot_root syscall to '%s' successful", rootfs);

	/* from here on the old root is addressed by its relative name */
	if (umount_oldrootfs(pivotdir) != 0)
		return -1;

	/* failing to remove the temporary mount point is not fatal */
	if (created_here && rmdir(pivotdir) != 0)
		WARN("can't remove mountpoint '%s': %m", pivotdir);

	return 0;
}
/*
 * Note: this is a verbatim copy of what is in monitor.c.  It is used
 * here only to generate a safe subdirectory name in /dev/ for the
 * container's /dev/.
 */
/* SHA-1 is deliberately not used, to avoid depending on HAVE_GNUTLS.
 * FNV has good anti-collision properties; pre-image resistance and
 * one-way-ness do not matter here, the goal is only a name that is
 * unique within the 108 bytes of space available.
 */
#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL)
/*
 * 64-bit FNV-1a hash of the 'len' bytes at 'buf', continuing from
 * 'hval' (pass FNV1A_64_INIT to start a new hash).  Returns the
 * updated hash value.
 */
static uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval)
{
	const unsigned char *octets = buf;
	size_t pos;

	for (pos = 0; pos < len; pos++) {
		/* xor the bottom with the current octet */
		hval ^= (uint64_t)octets[pos];

		/* multiply by the 64 bit FNV magic prime mod 2^64 */
		hval *= (uint64_t)0x100000001b3ULL;
	}

	return hval;
}
/*
 * Check to see if a directory has something mounted on it and,
 * if it does, return the fstype.
 *
 * Code largely based on detect_shared_rootfs below
 *
 * dir:    directory to look up in /proc/self/mounts.
 * fstype: optional output buffer of at least MAX_FSTYPE_LEN bytes; may
 *         be NULL when the caller only wants the count.
 *
 * Returns: # of matching entries in /proc/self/mounts
 *	if != 0 fstype is filled with the last filesystem value.
 *	if == 0 no matches found, fstype unchanged.
 *
 * ToDo: Maybe return the mount options in another parameter...
 */

#define LINELEN 4096
#define MAX_FSTYPE_LEN 128
static int mount_check_fs( const char *dir, char *fstype )
{
	char buf[LINELEN], *p;
	struct stat s;
	FILE *f;
	int found_fs = 0;
	char *p2;

	DEBUG("entering mount_check_fs for %s", dir);

	/* only an existing directory can be a mount point of interest */
	if ( 0 != access(dir, F_OK) || 0 != stat(dir, &s) || 0 == S_ISDIR(s.st_mode) ) {
		return 0;
	}

	f = fopen("/proc/self/mounts", "r");
	if (!f)
		return 0;

	/* each line reads: <device> <mount point> <fstype> ... */
	while (fgets(buf, LINELEN, f)) {
		p = strchr(buf, ' ');
		if( !p )
			continue;
		*p = '\0';
		p2 = p + 1;

		p = strchr(p2, ' ');
		if( !p )
			continue;
		*p = '\0';

		/* Compare the directory in the entry to desired */
		if( strcmp( p2, dir ) ) {
			continue;
		}

		p2 = p + 1;
		p = strchr( p2, ' ');
		if( !p )
			continue;
		*p = '\0';

		++found_fs;

		if( fstype ) {
			snprintf( fstype, MAX_FSTYPE_LEN, "%s", p2 );
		}
	}

	fclose(f);

	/* fstype may be NULL, or an untouched caller buffer when nothing
	 * matched; only print it when it is known to hold a string */
	DEBUG("mount_check_fs returning %d last %s", found_fs,
	      (fstype && found_fs) ? fstype : "(none)");

	return found_fs;
}
/*
 * Find a devtmpfs mount (expected on /dev) and create a per-container
 * subdirectory on it, so that it can be bind-mounted as the container's
 * /dev instead of mounting a tmpfs there.
 *
 * name:    container name.
 * path:    output buffer that receives the chosen directory.
 * lxcpath: the container's lxcpath, hashed into the directory name.
 *
 * Returns 'path' filled with the devtmpfs subdirectory on success, or
 * NULL on failure (callers then fall back to a tmpfs).
 */
static char *mk_devtmpfs(const char *name, char *path, const char *lxcpath)
{
	char *base_path = "/dev/.lxc";
	char *user_path = "/dev/.lxc/user";
	char candidate[MAXPATHLEN];
	char fstype[MAX_FSTYPE_LEN];
	struct stat st;
	uint64_t hash;
	int len;

	/* /dev/.lxc has to exist (mode 0755); without it we are done */
	if (access(base_path, F_OK) != 0 || stat(base_path, &st) != 0 || !S_ISDIR(st.st_mode)) {
		if (mkdir(base_path, 0755) != 0) {
			SYSERROR( "Unable to create /dev/.lxc for autodev" );
			return NULL;
		}
	}

	/*
	 * Programmer's note:
	 * Mounts made in this part of the code are not visible in the
	 * host, so /dev/.lxc has to be set up earlier if a tmpfs is
	 * needed there.  That only matters in the rare case where autodev
	 * is enabled for a container and the host has no devtmpfs on
	 * /dev.  We then fall back to the old method of mounting a tmpfs
	 * inside the container and lose visibility into its /dev.
	 *
	 * So: if /dev is not a devtmpfs mount, /dev/.lxc must already be
	 * a mount point of its own.
	 */
	if ((!mount_check_fs( "/dev", fstype ) || strcmp( "devtmpfs", fstype ) != 0)
	    && !mount_check_fs( "/dev/.lxc", NULL )) {
		ERROR("/dev/.lxc is not setup - taking fallback" );
		return NULL;
	}

	/*
	 * /dev/.lxc/user is for non-privileged users and is mode 1777
	 * like /tmp.  Failing to create it is not fatal yet; it only
	 * matters if the fallback below is needed.
	 */
	if (access(user_path, F_OK) != 0 || stat(user_path, &st) != 0 || !S_ISDIR(st.st_mode)) {
		if (mkdir(user_path, 01777) != 0) {
			/* Issue an error but don't fail yet! */
			ERROR("Unable to create /dev/.lxc/user");
		}
		/* the umask may have trimmed the mode, so set it explicitly */
		chmod(user_path, 01777);
	}

	/*
	 * Container names are unique within an lxcpath, so a hash of
	 * "<lxcpath>/<name>" makes the directory name unique.
	 */
	len = snprintf(candidate, MAXPATHLEN, "%s/%s", lxcpath, name);
	if (len < 0 || len >= MAXPATHLEN)
		return NULL;

	hash = fnv_64a_buf(candidate, len, FNV1A_64_INIT);

	len = snprintf(candidate, MAXPATHLEN, "%s/%s.%016" PRIx64, base_path, name, hash);
	if (len < 0 || len >= MAXPATHLEN)
		return NULL;

	if ((access(candidate, F_OK) != 0 || stat(candidate, &st) != 0 || !S_ISDIR(st.st_mode))
	    && mkdir(candidate, 0755) != 0) {
		/* Something failed with base_path, maybe we are an
		 * unprivileged user.  Try user_path instead. */
		INFO("Setup in /dev/.lxc failed. Trying /dev/.lxc/user." );

		len = snprintf(candidate, MAXPATHLEN, "%s/%s.%016" PRIx64, user_path, name, hash);
		if (len < 0 || len >= MAXPATHLEN)
			return NULL;

		if ((access(candidate, F_OK) != 0 || stat(candidate, &st) != 0 || !S_ISDIR(st.st_mode))
		    && mkdir(candidate, 0755) != 0) {
			ERROR("Container /dev setup in host /dev failed - taking fallback" );
			return NULL;
		}
	}

	strcpy( path, candidate );
	return path;
}
/*
 * Mount a /dev for the container below 'root'.
 *
 * name:    container name.
 * root:    container root mount point.
 * lxcpath: the container's lxcpath; <lxcpath>/<name>/rootfs.dev is the
 *          host-side path of the container's /dev.
 *
 * If mk_devtmpfs() provides a directory, it is bind-mounted on
 * <root>/dev and symlinked from the host-side path.  Otherwise a tmpfs
 * is mounted, unless something is already mounted on the host-side
 * path, in which case that is bind-mounted.  <root>/dev/pts is created
 * if it is missing.
 *
 * Returns 0 on success, -1 on failure.
 *
 * Do we want to add options for max size of /dev and a file to
 * specify which devices to create?
 */
static int mount_autodev(const char *name, char *root, const char *lxcpath)
{
	int ret;
	struct stat s;
	char path[MAXPATHLEN];
	char host_path[MAXPATHLEN];
	char devtmpfs_path[MAXPATHLEN];

	INFO("Mounting /dev under %s", root);

	/* a return value equal to the buffer size already means truncation */
	ret = snprintf(host_path, MAXPATHLEN, "%s/%s/rootfs.dev", lxcpath, name);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	if (mk_devtmpfs( name, devtmpfs_path, lxcpath ) ) {
		/*
		 * Get rid of old links and directories.
		 * This could be either a symlink and we remove it,
		 * or an empty directory and we remove it,
		 * or non-existent and we don't care,
		 * or a non-empty directory, and we will then emit an error
		 * but we will not fail out the process.
		 */
		unlink( host_path );
		rmdir( host_path );
		ret = symlink(devtmpfs_path, host_path);

		if ( ret < 0 ) {
			SYSERROR("WARNING: Failed to create symlink '%s'->'%s'", host_path, devtmpfs_path);
		}
		DEBUG("Bind mounting %s to %s", devtmpfs_path , path );
		ret = mount(devtmpfs_path, path, NULL, MS_BIND, 0 );
	} else {
		/* Only mount a tmpfs here if nothing is mounted already */
		if ( ! mount_check_fs( host_path, NULL ) ) {
			DEBUG("Mounting tmpfs to %s", host_path );
			ret = mount("none", path, "tmpfs", 0, "size=100000,mode=755");
		} else {
			/* This allows someone to manually set up a mount */
			DEBUG("Bind mounting %s to %s", host_path, path );
			ret = mount(host_path , path, NULL, MS_BIND, 0 );
		}
	}

	if (ret) {
		SYSERROR("Failed to mount /dev at %s", root);
		return -1;
	}

	ret = snprintf(path, MAXPATHLEN, "%s/dev/pts", root);
	if (ret < 0 || ret >= MAXPATHLEN)
		return -1;

	/*
	 * If we are running on a devtmpfs mapping, dev/pts may already exist.
	 * If not, then create it and exit if that fails...
	 */
	if ( 0 != access(path, F_OK) || 0 != stat(path, &s) || 0 == S_ISDIR(s.st_mode) ) {
		ret = mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH);
		if (ret) {
			SYSERROR("Failed to create /dev/pts in container");
			return -1;
		}
	}

	INFO("Mounted /dev under %s", root);
	return 0;
}
/*
 * Description of one device node created under the container's /dev:
 * node name, the mode handed to mknod() (file type plus permission bits)
 * and the major/minor numbers handed to makedev().
 */
struct lxc_devs {
const char *name;
mode_t mode;
int maj;
int min;
};
/*
 * Device nodes that setup_autodev() creates under <root>/dev.
 * All entries are character devices (S_IFCHR).
 */
static const struct lxc_devs lxc_devs[] = {
{ "null", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 3 },
{ "zero", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 5 },
{ "full", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 7 },
{ "urandom", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 9 },
{ "random", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 1, 8 },
{ "tty", S_IFCHR | S_IRWXU | S_IRWXG | S_IRWXO, 5, 0 },
/* console is the only entry restricted to owner read/write */
{ "console", S_IFCHR | S_IRUSR | S_IWUSR, 5, 1 },
};
/*
 * Populate <root>/dev with the device nodes listed in lxc_devs[].
 *
 * The umask is temporarily set to mask out the execute bits while the
 * nodes are created and is restored on every exit path, so a failure
 * does not leave the process with a modified umask.  Nodes that already
 * exist (EEXIST) are not treated as errors.
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_autodev(const char *root)
{
	int ret;
	char path[MAXPATHLEN];
	size_t i;
	mode_t cmask;

	INFO("Creating initial consoles under %s/dev", root);

	ret = snprintf(path, MAXPATHLEN, "%s/dev", root);
	if (ret < 0 || ret >= MAXPATHLEN) {
		ERROR("Error calculating container /dev location");
		return -1;
	}

	INFO("Populating /dev under %s", root);
	cmask = umask(S_IXUSR | S_IXGRP | S_IXOTH);
	for (i = 0; i < sizeof(lxc_devs) / sizeof(lxc_devs[0]); i++) {
		const struct lxc_devs *d = &lxc_devs[i];

		ret = snprintf(path, MAXPATHLEN, "%s/dev/%s", root, d->name);
		if (ret < 0 || ret >= MAXPATHLEN)
			goto fail;

		ret = mknod(path, d->mode, makedev(d->maj, d->min));
		if (ret && errno != EEXIST) {
			SYSERROR("Error creating %s", d->name);
			goto fail;
		}
	}
	umask(cmask);

	INFO("Populated /dev under %s", root);
	return 0;

fail:
	/* put the caller's umask back before reporting the failure */
	umask(cmask);
	return -1;
}
/*
 * Detect whether / is mounted MS_SHARED.  The only way I know of to
 * check that is through /proc/self/mountinfo.
 * I'm only checking for /.  If the container rootfs or mount location
 * is MS_SHARED, but not '/', then you're out of luck - figuring that
 * out would be too much work to be worth it.
 *
 * Each mountinfo line is scanned as space separated fields: the fifth
 * field is the mount point, the sixth the mount options, followed by
 * zero or more optional fields that end at a " - " separator.  Only
 * those optional fields are searched for "shared:", so that a
 * filesystem type, source or super option containing that text cannot
 * produce a false positive.
 *
 * Returns 1 if '/' is shared, 0 otherwise (including when mountinfo
 * cannot be opened).
 */
#define LINELEN 4096
int detect_shared_rootfs(void)
{
	char buf[LINELEN], *p;
	FILE *f;
	int i;
	char *p2, *sep;

	f = fopen("/proc/self/mountinfo", "r");
	if (!f)
		return 0;

	while (fgets(buf, LINELEN, f)) {
		/* skip four fields; p ends up on the space before the mount point */
		for (p = buf, i = 0; p && i < 4; i++)
			p = strchr(p + 1, ' ');
		if (!p)
			continue;

		p2 = strchr(p + 1, ' ');
		if (!p2)
			continue;
		*p2 = '\0';
		if (strcmp(p + 1, "/") != 0)
			continue;

		/* this is '/'. is it shared?  skip the mount options first */
		p = strchr(p2 + 1, ' ');
		if (!p)
			continue;

		/* cut the line at the separator that ends the optional fields */
		sep = strstr(p, " - ");
		if (sep)
			*sep = '\0';

		if (strstr(p, "shared:")) {
			fclose(f);
			return 1;
		}
	}

	fclose(f);
	return 0;
}
/*
* I'll forgive you for asking whether all of this is needed :) The
* answer is yes.
* pivot_root will fail if the new root, the put_old dir, or the parent
* of current->fs->root are MS_SHARED. (parent of current->fs_root may
* or may not be current->fs_root - if we assumed it always was, we could
* just mount --make-rslave /). So,
* 1. mount a tiny tmpfs to be parent of current->fs->root.
* 2. make that MS_SLAVE
* 3. make a 'root' directory under that
* 4. mount --rbind / under the $tinyroot/root.
* 5. make that rslave
* 6. chdir and chroot into $tinyroot/root
* 7. $tinyroot will be unmounted by our parent in start.c
*/
static int chroot_into_slave(struct lxc_conf *conf)
{
char path[MAXPATHLEN];
const char *destpath = conf->rootfs.mount;
int ret;
if (mount(destpath, destpath, NULL, MS_BIND, 0)) {
SYSERROR("failed to mount %s bind", destpath);
return -1;
}
if (mount("", destpath, NULL, MS_SLAVE, 0)) {
SYSERROR("failed to make %s slave", destpath);
return -1;
}
if (mount("none", destpath, "tmpfs", 0, "size=10000,mode=755")) {
SYSERROR("Failed to mount tmpfs / at %s", destpath);
return -1;
}
ret = snprintf(path, MAXPATHLEN, "%s/root", destpath);
if (ret < 0 || ret >= MAXPATHLEN) {
ERROR("out of memory making root path");
return -1;
}
if (mkdir(path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
SYSERROR("Failed to create /dev/pts in container");
return -1;
}
if (mount("/", path, NULL, MS_BIND|MS_REC, 0)) {
SYSERROR("Failed to rbind mount / to %s", path);
return -1;
}
if (mount("", destpath, NULL, MS_SLAVE|MS_REC, 0)) {
SYSERROR("Failed to make tmp-/ at %s rslave", path);
return -1;
}
if (chdir(path)) {
SYSERROR("Failed to chdir into tmp-/");
return -1;
}
if (chroot(path)) {
SYSERROR("Failed to chroot into tmp-/");
return -1;
}
INFO("Chrooted into tmp-/ at %s", path);
return 0;
}
/*
 * Mount the container's root filesystem on conf->rootfs.mount.
 *
 * Without a configured rootfs path only '/' is made rslave.  Otherwise
 * the mount point and source are checked for accessibility, a slave
 * chroot is entered when '/' is shared, and the rootfs is mounted through
 * a bdev if possible, falling back to mount_rootfs().
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_rootfs(struct lxc_conf *conf)
{
	const struct lxc_rootfs *rfs = &conf->rootfs;
	struct bdev *dev;
	int mounted;

	if (rfs->path == NULL) {
		if (mount("", "/", NULL, MS_SLAVE|MS_REC, 0) != 0) {
			SYSERROR("Failed to make / rslave");
			return -1;
		}
		return 0;
	}

	if (access(rfs->mount, F_OK) != 0) {
		SYSERROR("failed to access to '%s', check it is present",
			 rfs->mount);
		return -1;
	}

	if (access(rfs->path, R_OK) != 0) {
		print_top_failing_dir(rfs->path);
		return -1;
	}

	if (detect_shared_rootfs() && chroot_into_slave(conf)) {
		ERROR("Failed to chroot into slave /");
		return -1;
	}

	/* First try mounting rootfs using a bdev */
	dev = bdev_init(rfs->path, rfs->mount, rfs->options);
	mounted = dev && dev->ops->mount(dev) == 0;
	if (dev)
		bdev_put(dev);

	/* fall back to a plain mount when no bdev handled it */
	if (!mounted && mount_rootfs(rfs->path, rfs->mount, rfs->options)) {
		ERROR("failed to mount rootfs");
		return -1;
	}

	DEBUG("mounted '%s' on '%s'", rfs->path, rfs->mount);
	return 0;
}
/*
 * Pivot into the container rootfs when one is configured.
 * Returns 0 on success or when there is no rootfs path, -1 on failure.
 */
static int setup_pivot_root(const struct lxc_rootfs *rootfs)
{
	if (rootfs->path != NULL &&
	    setup_rootfs_pivot_root(rootfs->mount, rootfs->pivot)) {
		ERROR("failed to setup pivot root");
		return -1;
	}

	return 0;
}
/*
 * Mount a new devpts instance on /dev/pts and make /dev/ptmx refer to it.
 *
 * pts: nothing is done when this is 0.
 *
 * /dev/ptmx is created as a symlink when missing; when it exists and does
 * not already resolve to /dev/pts/ptmx, /dev/pts/ptmx is bind mounted
 * over it.
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_pts(int pts)
{
	char resolved[PATH_MAX];

	if (pts == 0)
		return 0;

	/* an existing instance has to go before the new one is mounted */
	if (access("/dev/pts/ptmx", F_OK) == 0 && umount("/dev/pts") != 0) {
		SYSERROR("failed to umount 'dev/pts'");
		return -1;
	}

	if (mount("devpts", "/dev/pts", "devpts", MS_MGC_VAL,
		  "newinstance,ptmxmode=0666,mode=0620,gid=5") != 0) {
		SYSERROR("failed to mount a new instance of '/dev/pts'");
		return -1;
	}

	if (access("/dev/ptmx", F_OK) != 0) {
		if (symlink("/dev/pts/ptmx", "/dev/ptmx") != 0) {
			SYSERROR("failed to symlink '/dev/pts/ptmx'->'/dev/ptmx'");
			return -1;
		}
		return 0;
	}

	/* nothing left to do if /dev/ptmx already points at the new instance */
	if (realpath("/dev/ptmx", resolved) != NULL &&
	    strcmp(resolved, "/dev/pts/ptmx") == 0)
		return 0;

	/* fallback here, /dev/pts/ptmx exists just mount bind */
	if (mount("/dev/pts/ptmx", "/dev/ptmx", "none", MS_BIND, 0) != 0) {
		SYSERROR("mount failed '/dev/pts/ptmx'->'/dev/ptmx'");
		return -1;
	}

	INFO("created new pts instance");
	return 0;
}
/*
 * Apply the requested execution personality.
 *
 * persona: value passed to personality(); -1 means "leave unchanged".
 *
 * Compiled to a no-op when HAVE_SYS_PERSONALITY_H is not set.
 * Returns 0 on success, -1 if personality() fails.
 */
static int setup_personality(int persona)
{
#if HAVE_SYS_PERSONALITY_H
	if (persona != -1) {
		if (personality(persona) < 0) {
			SYSERROR("failed to set personality to '0x%x'", persona);
			return -1;
		}

		INFO("set personality to '0x%x'", persona);
	}
#endif

	return 0;
}
static int setup_dev_console(const struct lxc_rootfs *rootfs,
const struct lxc_console *console)
{
char path[MAXPATHLEN];
struct stat s;
int ret;
ret = snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
if (ret >= sizeof(path)) {
ERROR("console path too long");
return -1;
}
if (access(path, F_OK)) {
WARN("rootfs specified but no console found at '%s'", path);
return 0;
}
if (console->master < 0) {
INFO("no console");
return 0;
}
if (stat(path, &s)) {
SYSERROR("failed to stat '%s'", path);
return -1;
}
if (chmod(console->name, s.st_mode)) {
SYSERROR("failed to set mode '0%o' to '%s'",
s.st_mode, console->name);
return -1;
}
if (mount(console->name, path, "none", MS_BIND, 0)) {
ERROR("failed to mount '%s' on '%s'", console->name, path);
return -1;
}
INFO("console has been setup");
return 0;
}
static int setup_ttydir_console(const struct lxc_rootfs *rootfs,
const struct lxc_console *console,
char *ttydir)
{
char path[MAXPATHLEN], lxcpath[MAXPATHLEN];
int ret;
/* create rootfs/dev/<ttydir> directory */
ret = snprintf(path, sizeof(path), "%s/dev/%s", rootfs->mount,
ttydir);
if (ret >= sizeof(path))
return -1;
ret = mkdir(path, 0755);
if (ret && errno != EEXIST) {
SYSERROR("failed with errno %d to create %s", errno, path);
return -1;
}
INFO("created %s", path);
ret = snprintf(lxcpath, sizeof(lxcpath), "%s/dev/%s/console",
rootfs->mount, ttydir);
if (ret >= sizeof(lxcpath)) {
ERROR("console path too long");
return -1;
}
snprintf(path, sizeof(path), "%s/dev/console", rootfs->mount);
ret = unlink(path);
if (ret && errno != ENOENT) {
SYSERROR("error unlinking %s", path);
return -1;
}
ret = creat(lxcpath, 0660);
if (ret==-1 && errno != EEXIST) {
SYSERROR("error %d creating %s", errno, lxcpath);
return -1;
}
if (ret >= 0)
close(ret);
if (console->master < 0) {
INFO("no console");
return 0;
}
if (mount(console->name, lxcpath, "none", MS_BIND, 0)) {
ERROR("failed to mount '%s' on '%s'", console->name, lxcpath);
return -1;
}
/* create symlink from rootfs/dev/console to 'lxc/console' */
ret = snprintf(lxcpath, sizeof(lxcpath), "%s/console", ttydir);
if (ret >= sizeof(lxcpath)) {
ERROR("lxc/console path too long");
return -1;
}
ret = symlink(lxcpath, path);
if (ret) {
SYSERROR("failed to create symlink for console");
return -1;
}
INFO("console has been setup on %s", lxcpath);
return 0;
}
/*
 * Set up the container console, either directly on /dev/console or
 * below /dev/<ttydir> when a ttydir is given.
 * Returns the result of the chosen helper, or 0 without a rootfs.
 */
static int setup_console(const struct lxc_rootfs *rootfs,
			 const struct lxc_console *console,
			 char *ttydir)
{
	/* We don't have a rootfs, /dev/console will be shared */
	if (rootfs->path == NULL)
		return 0;

	return ttydir ? setup_ttydir_console(rootfs, console, ttydir)
		      : setup_dev_console(rootfs, console);
}
static int setup_kmsg(const struct lxc_rootfs *rootfs,
const struct lxc_console *console)
{
char kpath[MAXPATHLEN];
int ret;
if (!rootfs->path)
return 0;
ret = snprintf(kpath, sizeof(kpath), "%s/dev/kmsg", rootfs->mount);
if (ret < 0 || ret >= sizeof(kpath))
return -1;
ret = unlink(kpath);
if (ret && errno != ENOENT) {
SYSERROR("error unlinking %s", kpath);
return -1;
}
ret = symlink("console", kpath);
if (ret) {
SYSERROR("failed to create symlink for kmsg");
return -1;
}
return 0;
}
/*
 * Handle one mount option token.
 *
 * If opt starts with the name of an entry in mount_opt[], the entry's
 * flag is set or cleared in *flags.  Otherwise opt is appended to the
 * string *data, separated by ',' from what is already there.  The caller
 * must provide enough room in *data.
 *
 * NOTE(review): the comparison is a prefix match on the table name, so an
 * option that merely begins with a known flag name is consumed as that
 * flag - confirm against the contents of mount_opt[] that this is
 * intended.
 */
static void parse_mntopt(char *opt, unsigned long *flags, char **data)
{
	struct mount_opt *entry = &mount_opt[0];

	while (entry->name != NULL) {
		if (strncmp(opt, entry->name, strlen(entry->name)) == 0) {
			if (entry->clear)
				*flags &= ~entry->flag;
			else
				*flags |= entry->flag;
			return;
		}
		entry++;
	}

	/* not a known flag: hand it through as filesystem data */
	if (strlen(*data) != 0)
		strcat(*data, ",");
	strcat(*data, opt);
}
/*
 * Split a comma separated mount option string into flags and data.
 *
 * mntopts:  option string, may be NULL
 * mntflags: receives the accumulated mount flags (0 when there are none)
 * mntdata:  receives a malloc'ed string with the options that are not
 *           flags, or NULL when there are none; the caller frees it
 *
 * Returns 0 on success, -1 on allocation failure (*mntdata is NULL then).
 */
int parse_mntopts(const char *mntopts, unsigned long *mntflags,
		  char **mntdata)
{
	char *copy, *extra;
	char *tok, *state = NULL;

	*mntdata = NULL;
	*mntflags = 0L;

	if (mntopts == NULL)
		return 0;

	/* strtok_r() modifies its input, so work on a private copy */
	copy = strdup(mntopts);
	if (copy == NULL) {
		SYSERROR("failed to allocate memory");
		return -1;
	}

	extra = malloc(strlen(copy) + 1);
	if (extra == NULL) {
		SYSERROR("failed to allocate memory");
		free(copy);
		return -1;
	}
	extra[0] = 0;

	tok = strtok_r(copy, ",", &state);
	while (tok != NULL) {
		parse_mntopt(tok, mntflags, &extra);
		tok = strtok_r(NULL, ",", &state);
	}

	if (extra[0] != 0)
		*mntdata = extra;
	else
		free(extra);

	free(copy);
	return 0;
}
/*
 * Mount fsname on target.
 *
 * The first mount() is issued without MS_REMOUNT.  When the requested
 * flags contain MS_REMOUNT or MS_BIND a second mount() with MS_REMOUNT
 * added follows, so that the remaining options take effect.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry(const char *fsname, const char *target,
		       const char *fstype, unsigned long mountflags,
		       const char *data)
{
	int needs_remount = (mountflags & (MS_REMOUNT | MS_BIND)) != 0;

	if (mount(fsname, target, fstype, mountflags & ~MS_REMOUNT, data) != 0) {
		SYSERROR("failed to mount '%s' on '%s'", fsname, target);
		return -1;
	}

	if (needs_remount) {
		DEBUG("remounting %s on %s to respect bind or remount options",
		      fsname, target);

		if (mount(fsname, target, fstype,
			  mountflags | MS_REMOUNT, data) != 0) {
			SYSERROR("failed to mount '%s' on '%s'",
				 fsname, target);
			return -1;
		}
	}

	DEBUG("mounted '%s' on '%s', type '%s'", fsname, target, fstype);
	return 0;
}
/*
 * Mount one fstab entry on the host filesystem view (no rootfs).
 *
 * The "create=dir" / "create=file" options ask for the mount target to be
 * created first; failing to do so only produces a warning, the mount is
 * still attempted.  An entry carrying the "optional" option never causes
 * a failure result.
 *
 * Returns 0 on success (or for optional entries), -1 on failure.
 */
static inline int mount_entry_on_systemfs(const struct mntent *mntent)
{
	unsigned long mntflags;
	char *mntdata;
	int ret;

	if (hasmntopt(mntent, "create=dir")) {
		/*
		 * Warn when creation failed.
		 * NOTE(review): assumes mkdir_p() returns 0 on success and
		 * non-zero on failure - confirm against its definition.
		 */
		if (mkdir_p(mntent->mnt_dir, 0755) != 0)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
	}

	if (hasmntopt(mntent, "create=file") && access(mntent->mnt_dir, F_OK)) {
		FILE *pathfile;
		/*
		 * dirname() may modify its argument and may return a pointer
		 * to static storage, so keep the strdup() result for free().
		 */
		char *dircopy = strdup(mntent->mnt_dir);

		if (dircopy) {
			mkdir_p(dirname(dircopy), 0755);
			free(dircopy);
		}

		pathfile = fopen(mntent->mnt_dir, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", mntent->mnt_dir);
		else
			fclose(pathfile);
	}

	/* parse_mntopts() leaves mntdata NULL on failure, nothing to free */
	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0)
		return -1;

	ret = mount_entry(mntent->mnt_fsname, mntent->mnt_dir,
			  mntent->mnt_type, mntflags, mntdata);

	if (hasmntopt(mntent, "optional") != NULL)
		ret = 0;

	free(mntdata);
	return ret;
}
/*
 * Mount one fstab entry whose target is an absolute path.
 *
 * The target must contain either "<lxcpath>/<lxc_name>/rootfs" or
 * rootfs->path; what follows that match is re-based onto rootfs->mount.
 * Entries matching neither are ignored with a warning (return 0).
 *
 * The "create=dir" / "create=file" options ask for the mount target to be
 * created first; failing to do so only produces a warning.  An entry
 * carrying the "optional" option never causes a failure result.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_on_absolute_rootfs(const struct mntent *mntent,
					  const struct lxc_rootfs *rootfs,
					  const char *lxc_name)
{
	char *aux = NULL;
	char path[MAXPATHLEN];
	unsigned long mntflags;
	char *mntdata;
	int r, ret;
	size_t offset = 0;
	const char *lxcpath;

	lxcpath = lxc_global_config_value("lxc.lxcpath");
	if (!lxcpath) {
		ERROR("Out of memory");
		return -1;
	}

	/* if rootfs->path is a blockdev path, allow container fstab to
	 * use $lxcpath/CN/rootfs as the target prefix */
	r = snprintf(path, MAXPATHLEN, "%s/%s/rootfs", lxcpath, lxc_name);
	if (r >= 0 && r < MAXPATHLEN) {
		aux = strstr(mntent->mnt_dir, path);
		if (aux)
			offset = strlen(path);
	}

	if (!aux) {
		aux = strstr(mntent->mnt_dir, rootfs->path);
		if (!aux) {
			WARN("ignoring mount point '%s'", mntent->mnt_dir);
			return 0;
		}
		offset = strlen(rootfs->path);
	}

	r = snprintf(path, MAXPATHLEN, "%s/%s", rootfs->mount,
		     aux + offset);
	if (r < 0 || r >= MAXPATHLEN) {
		WARN("pathnme too long for '%s'", mntent->mnt_dir);
		return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		/*
		 * Warn when creation failed.
		 * NOTE(review): assumes mkdir_p() returns 0 on success and
		 * non-zero on failure - confirm against its definition.
		 */
		if (mkdir_p(path, 0755) != 0)
			WARN("Failed to create mount target '%s'", path);
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		FILE *pathfile;
		/*
		 * dirname() may modify its argument and may return a pointer
		 * to static storage, so keep the strdup() result for free().
		 */
		char *dircopy = strdup(path);

		if (dircopy) {
			mkdir_p(dirname(dircopy), 0755);
			free(dircopy);
		}

		pathfile = fopen(path, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", path);
		else
			fclose(pathfile);
	}

	/* parse_mntopts() leaves mntdata NULL on failure, nothing to free */
	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0)
		return -1;

	ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
			  mntflags, mntdata);
	free(mntdata);

	if (hasmntopt(mntent, "optional") != NULL)
		ret = 0;

	return ret;
}
/*
 * Mount one fstab entry whose target is relative to the container root.
 *
 * The target used is "<rootfs>/<mnt_dir>".  The "create=dir" /
 * "create=file" options ask for the mount target to be created first;
 * failing to do so only produces a warning.  An entry carrying the
 * "optional" option never causes a failure result.
 *
 * Returns 0 on success, -1 on failure.
 */
static int mount_entry_on_relative_rootfs(const struct mntent *mntent,
					  const char *rootfs)
{
	char path[MAXPATHLEN];
	unsigned long mntflags;
	char *mntdata;
	int ret;

	/* relative to root mount point */
	ret = snprintf(path, sizeof(path), "%s/%s", rootfs, mntent->mnt_dir);
	if (ret < 0 || (size_t)ret >= sizeof(path)) {
		ERROR("path name too long");
		return -1;
	}

	if (hasmntopt(mntent, "create=dir")) {
		/*
		 * Warn when creation failed.
		 * NOTE(review): assumes mkdir_p() returns 0 on success and
		 * non-zero on failure - confirm against its definition.
		 */
		if (mkdir_p(path, 0755) != 0)
			WARN("Failed to create mount target '%s'", path);
	}

	if (hasmntopt(mntent, "create=file") && access(path, F_OK)) {
		FILE *pathfile;
		/*
		 * dirname() may modify its argument and may return a pointer
		 * to static storage, so keep the strdup() result for free().
		 */
		char *dircopy = strdup(path);

		if (dircopy) {
			mkdir_p(dirname(dircopy), 0755);
			free(dircopy);
		}

		pathfile = fopen(path, "wb");
		if (!pathfile)
			WARN("Failed to create mount target '%s'", path);
		else
			fclose(pathfile);
	}

	/* parse_mntopts() leaves mntdata NULL on failure, nothing to free */
	if (parse_mntopts(mntent->mnt_opts, &mntflags, &mntdata) < 0)
		return -1;

	ret = mount_entry(mntent->mnt_fsname, path, mntent->mnt_type,
			  mntflags, mntdata);

	if (hasmntopt(mntent, "optional") != NULL)
		ret = 0;

	free(mntdata);
	return ret;
}
/*
 * Mount every entry read from an fstab-format stream.
 *
 * Without a rootfs path entries are mounted as they are; with one,
 * targets not starting with '/' are taken relative to rootfs->mount and
 * absolute targets are re-based by mount_entry_on_absolute_rootfs().
 * Processing stops at the first entry that fails.
 *
 * Returns 0 when all entries were handled, -1 otherwise.
 */
static int mount_file_entries(const struct lxc_rootfs *rootfs, FILE *file,
	const char *lxc_name)
{
	struct mntent entry;
	char linebuf[4096];

	while (getmntent_r(file, &entry, linebuf, sizeof(linebuf))) {
		int failed;

		if (!rootfs->path)
			failed = mount_entry_on_systemfs(&entry);
		else if (entry.mnt_dir[0] != '/')
			/* We have a separate root, mounts are relative to it */
			failed = mount_entry_on_relative_rootfs(&entry,
								rootfs->mount);
		else
			failed = mount_entry_on_absolute_rootfs(&entry, rootfs,
								lxc_name);

		if (failed)
			return -1;
	}

	INFO("mount points have been setup");
	return 0;
}
/*
 * Mount all entries of the given fstab file.
 *
 * fstab: path of the file, NULL means there is nothing to do.
 *
 * Returns 0 on success, -1 if the file cannot be opened or an entry
 * fails to mount.
 */
static int setup_mount(const struct lxc_rootfs *rootfs, const char *fstab,
	const char *lxc_name)
{
	FILE *stream;
	int rc;

	if (fstab == NULL)
		return 0;

	stream = setmntent(fstab, "r");
	if (stream == NULL) {
		SYSERROR("failed to use '%s'", fstab);
		return -1;
	}

	rc = mount_file_entries(rootfs, stream, lxc_name);
	endmntent(stream);

	return rc;
}
/*
 * Mount the entries held in a list of fstab-format strings.
 *
 * The strings are written one per line into a temporary file, which is
 * then processed by mount_file_entries().
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_mount_entries(const struct lxc_rootfs *rootfs, struct lxc_list *mount,
	const char *lxc_name)
{
	struct lxc_list *it;
	FILE *scratch;
	char *line;
	int rc;

	scratch = tmpfile();
	if (scratch == NULL) {
		ERROR("tmpfile error: %m");
		return -1;
	}

	lxc_list_for_each(it, mount) {
		line = it->elem;
		fprintf(scratch, "%s\n", line);
	}

	/* read back from the start what was just written */
	rewind(scratch);

	rc = mount_file_entries(rootfs, scratch, lxc_name);

	fclose(scratch);
	return rc;
}
/*
 * Drop every capability listed in caps from the bounding set.
 *
 * Each list element is a string: either a name found in caps_opt[] or a
 * decimal capability number not above lxc_caps_last_cap().  Empty
 * strings, trailing garbage, negative and out-of-range numbers are
 * rejected.
 *
 * Returns 0 on success, -1 on an unknown capability or prctl() failure.
 */
static int setup_caps(struct lxc_list *caps)
{
	struct lxc_list *iterator;
	char *drop_entry;
	char *ptr;
	size_t i;
	int capid;

	lxc_list_for_each(iterator, caps) {

		drop_entry = iterator->elem;

		capid = -1;

		for (i = 0; i < sizeof(caps_opt)/sizeof(caps_opt[0]); i++) {

			if (strcmp(drop_entry, caps_opt[i].name))
				continue;

			capid = caps_opt[i].value;
			break;
		}

		if (capid < 0) {
			/* try to see if it's numeric, so the user may specify
			 * capabilities that the running kernel knows about but
			 * we don't */
			long val;

			errno = 0;
			val = strtol(drop_entry, &ptr, 10);
			/* range-check as long, before narrowing to int; an
			 * empty string leaves ptr == drop_entry */
			if (ptr != drop_entry && *ptr == '\0' && errno == 0 &&
			    val >= 0 && val <= lxc_caps_last_cap())
				capid = (int)val;
		}

		if (capid < 0) {
			ERROR("unknown capability %s", drop_entry);
			return -1;
		}

		DEBUG("drop capability '%s' (%d)", drop_entry, capid);

		if (prctl(PR_CAPBSET_DROP, capid, 0, 0, 0)) {
			SYSERROR("failed to remove %s capability", drop_entry);
			return -1;
		}

	}

	DEBUG("capabilities have been setup");

	return 0;
}
/*
 * Drop every capability from the bounding set except those listed in
 * caps.
 *
 * Each list element is a string: either a name found in caps_opt[] or a
 * decimal capability number not above lxc_caps_last_cap().  Empty
 * strings, trailing garbage, negative and out-of-range numbers are
 * rejected.
 *
 * Returns 0 on success, -1 on an unknown capability, an implausible
 * capability count or prctl() failure.
 */
static int dropcaps_except(struct lxc_list *caps)
{
	struct lxc_list *iterator;
	char *keep_entry;
	char *ptr;
	size_t j;
	int i, capid;
	int numcaps = lxc_caps_last_cap() + 1;

	INFO("found %d capabilities", numcaps);

	if (numcaps <= 0 || numcaps > 200)
		return -1;

	// caplist[i] is 1 if we keep capability i
	int *caplist = alloca(numcaps * sizeof(int));
	memset(caplist, 0, numcaps * sizeof(int));

	lxc_list_for_each(iterator, caps) {

		keep_entry = iterator->elem;

		capid = -1;

		for (j = 0; j < sizeof(caps_opt)/sizeof(caps_opt[0]); j++) {

			if (strcmp(keep_entry, caps_opt[j].name))
				continue;

			capid = caps_opt[j].value;
			break;
		}

		if (capid < 0) {
			/* try to see if it's numeric, so the user may specify
			 * capabilities that the running kernel knows about but
			 * we don't */
			long val;

			errno = 0;
			val = strtol(keep_entry, &ptr, 10);
			/* range-check as long, before narrowing to int; an
			 * empty string leaves ptr == keep_entry */
			if (ptr != keep_entry && *ptr == '\0' && errno == 0 &&
			    val >= 0 && val < numcaps)
				capid = (int)val;
		}

		if (capid < 0) {
			ERROR("unknown capability %s", keep_entry);
			return -1;
		}

		/* entries in this list are the ones that survive */
		DEBUG("keep capability '%s' (%d)", keep_entry, capid);

		caplist[capid] = 1;
	}

	for (i = 0; i < numcaps; i++) {
		if (caplist[i])
			continue;
		if (prctl(PR_CAPBSET_DROP, i, 0, 0, 0)) {
			SYSERROR("failed to remove capability %d", i);
			return -1;
		}
	}

	DEBUG("capabilities have been setup");

	return 0;
}
/*
 * Set the hardware (MAC) address of a network interface.
 *
 * hwaddr: textual MAC address, converted with lxc_convert_mac()
 * ifname: NUL terminated interface name
 *
 * Returns 0 on success; -1 if the address cannot be converted or the
 * socket cannot be created; the ioctl() result if SIOCSIFHWADDR fails.
 */
static int setup_hw_addr(char *hwaddr, const char *ifname)
{
	struct sockaddr sockaddr;
	struct ifreq ifr;
	int ret, fd, saved_errno;

	ret = lxc_convert_mac(hwaddr, &sockaddr);
	if (ret) {
		ERROR("mac address '%s' conversion failed : %s",
		      hwaddr, strerror(-ret));
		return -1;
	}

	/* start from a fully defined request and copy the name as a string,
	 * so no bytes beyond the end of ifname are read */
	memset(&ifr, 0, sizeof(ifr));
	snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "%s", ifname);
	memcpy((char *) &ifr.ifr_hwaddr, (char *) &sockaddr, sizeof(sockaddr));

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0) {
		ERROR("socket failure : %s", strerror(errno));
		return -1;
	}

	ret = ioctl(fd, SIOCSIFHWADDR, &ifr);
	/* close() may overwrite errno, keep the ioctl() one for the message */
	saved_errno = errno;
	close(fd);
	if (ret) {
		ERROR("ioctl failure : %s", strerror(saved_errno));
		return ret;
	}

	DEBUG("mac address '%s' on '%s' has been setup", hwaddr, ifr.ifr_name);

	return 0;
}
/*
 * Add every IPv4 address of the list to the interface with the given
 * index.  Stops at the first failure.
 * Returns 0 on success, -1 on failure.
 */
static int setup_ipv4_addr(struct lxc_list *ip, int ifindex)
{
	struct lxc_list *it;

	lxc_list_for_each(it, ip) {
		struct lxc_inetdev *dev = it->elem;
		int rc = lxc_ipv4_addr_add(ifindex, &dev->addr,
					   &dev->bcast, dev->prefix);

		if (rc != 0) {
			ERROR("failed to setup_ipv4_addr ifindex %d : %s",
			      ifindex, strerror(-rc));
			return -1;
		}
	}

	return 0;
}
/*
 * Add every IPv6 address of the list to the interface with the given
 * index.  Stops at the first failure.
 * Returns 0 on success, -1 on failure.
 */
static int setup_ipv6_addr(struct lxc_list *ip, int ifindex)
{
	struct lxc_list *it;

	lxc_list_for_each(it, ip) {
		struct lxc_inet6dev *dev = it->elem;
		int rc = lxc_ipv6_addr_add(ifindex, &dev->addr,
					   &dev->mcast, &dev->acast,
					   dev->prefix);

		if (rc != 0) {
			ERROR("failed to setup_ipv6_addr ifindex %d : %s",
			      ifindex, strerror(-rc));
			return -1;
		}
	}

	return 0;
}
/*
 * Configure one network device: name, hardware address, IPv4/IPv6
 * addresses, link state and default gateways.
 *
 * A netdev with ifindex 0 has no interface of its own; in that case only
 * the loopback is brought up (when IFF_UP is requested).
 *
 * Returns 0 on success, -1 on failure.
 */
static int setup_netdev(struct lxc_netdev *netdev)
{
char ifname[IFNAMSIZ];
/* current_ifname aliases ifname: both always name the same buffer */
char *current_ifname = ifname;
int err;
/* empty network namespace */
if (!netdev->ifindex) {
if (netdev->flags & IFF_UP) {
err = lxc_netdev_up("lo");
if (err) {
ERROR("failed to set the loopback up : %s",
strerror(-err));
return -1;
}
}
return 0;
}
/* get the new ifindex in case of physical netdev */
if (netdev->type == LXC_NET_PHYS)
if (!(netdev->ifindex = if_nametoindex(netdev->link))) {
ERROR("failed to get ifindex for %s",
netdev->link);
return -1;
}
/* retrieve the name of the interface */
if (!if_indextoname(netdev->ifindex, current_ifname)) {
ERROR("no interface corresponding to index '%d'",
netdev->ifindex);
return -1;
}
/* default: let the system choose the interface name; physical devices
 * get their link name */
if (!netdev->name)
netdev->name = netdev->type == LXC_NET_PHYS ?
netdev->link : "eth%d";
/* rename the interface name */
err = lxc_netdev_rename_by_name(ifname, netdev->name);
if (err) {
ERROR("failed to rename %s->%s : %s", ifname, netdev->name,
strerror(-err));
return -1;
}
/* Re-read the name of the interface because its name has changed
 * and would be automatically allocated by the system
 */
if (!if_indextoname(netdev->ifindex, current_ifname)) {
ERROR("no interface corresponding to index '%d'",
netdev->ifindex);
return -1;
}
/* set a mac address */
if (netdev->hwaddr) {
if (setup_hw_addr(netdev->hwaddr, current_ifname)) {
ERROR("failed to setup hw address for '%s'",
current_ifname);
return -1;
}
}
/* setup ipv4 addresses on the interface */
if (setup_ipv4_addr(&netdev->ipv4, netdev->ifindex)) {
ERROR("failed to setup ip addresses for '%s'",
ifname);
return -1;
}
/* setup ipv6 addresses on the interface */
if (setup_ipv6_addr(&netdev->ipv6, netdev->ifindex)) {
ERROR("failed to setup ipv6 addresses for '%s'",
ifname);
return -1;
}
/* set the network device up */
if (netdev->flags & IFF_UP) {
/* NOTE(review): this declaration shadows the function-level err */
int err;
err = lxc_netdev_up(current_ifname);
if (err) {
ERROR("failed to set '%s' up : %s", current_ifname,
strerror(-err));
return -1;
}
/* the network is up, make the loopback up too */
err = lxc_netdev_up("lo");
if (err) {
ERROR("failed to set the loopback up : %s",
strerror(-err));
return -1;
}
}
/* We can only set up the default routes after bringing
 * up the interface, since bringing up the interface adds
 * the link-local routes and we can't add a default
 * route if the gateway is not reachable. */
/* setup ipv4 gateway on the interface */
if (netdev->ipv4_gateway) {
if (!(netdev->flags & IFF_UP)) {
ERROR("Cannot add ipv4 gateway for %s when not bringing up the interface", ifname);
return -1;
}
if (lxc_list_empty(&netdev->ipv4)) {
ERROR("Cannot add ipv4 gateway for %s when not assigning an address", ifname);
return -1;
}
err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
if (err) {
/* first attempt failed: add a route to the gateway itself, then
 * retry; a failure of the dest route is only logged */
err = lxc_ipv4_dest_add(netdev->ifindex, netdev->ipv4_gateway);
if (err) {
ERROR("failed to add ipv4 dest for '%s': %s",
ifname, strerror(-err));
}
err = lxc_ipv4_gateway_add(netdev->ifindex, netdev->ipv4_gateway);
if (err) {
ERROR("failed to setup ipv4 gateway for '%s': %s",
ifname, strerror(-err));
if (netdev->ipv4_gateway_auto) {
char buf[INET_ADDRSTRLEN];
inet_ntop(AF_INET, netdev->ipv4_gateway, buf, sizeof(buf));
ERROR("tried to set autodetected ipv4 gateway '%s'", buf);
}
return -1;
}
}
}
/* setup ipv6 gateway on the interface */
if (netdev->ipv6_gateway) {
if (!(netdev->flags & IFF_UP)) {
ERROR("Cannot add ipv6 gateway for %s when not bringing up the interface", ifname);
return -1;
}
/* a link-local gateway is accepted even without an assigned address */
if (lxc_list_empty(&netdev->ipv6) && !IN6_IS_ADDR_LINKLOCAL(netdev->ipv6_gateway)) {
ERROR("Cannot add ipv6 gateway for %s when not assigning an address", ifname);
return -1;
}
err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
if (err) {
/* same retry scheme as for the ipv4 gateway above */
err = lxc_ipv6_dest_add(netdev->ifindex, netdev->ipv6_gateway);
if (err) {
ERROR("failed to add ipv6 dest for '%s': %s",
ifname, strerror(-err));
}
err = lxc_ipv6_gateway_add(netdev->ifindex, netdev->ipv6_gateway);
if (err) {
ERROR("failed to setup ipv6 gateway for '%s': %s",
ifname, strerror(-err));
if (netdev->ipv6_gateway_auto) {
char buf[INET6_ADDRSTRLEN];
inet_ntop(AF_INET6, netdev->ipv6_gateway, buf, sizeof(buf));
ERROR("tried to set autodetected ipv6 gateway '%s'", buf);
}
return -1;
}
}
}
DEBUG("'%s' has been setup", current_ifname);
return 0;
}
/*
 * Configure every network device of the list with setup_netdev().
 * Stops at the first failure.
 * Returns 0 on success, -1 on failure.
 */
static int setup_network(struct lxc_list *network)
{
	struct lxc_list *it;

	lxc_list_for_each(it, network) {
		struct lxc_netdev *dev = it->elem;

		if (setup_netdev(dev) != 0) {
			ERROR("failed to setup netdev");
			return -1;
		}
	}

	/* only announce success when there was something to set up */
	if (lxc_list_empty(network))
		return 0;

	INFO("network has been setup");
	return 0;
}
/*
 * Give the saved physical nics their original names back and release the
 * saved-nic bookkeeping in conf.
 *
 * After the call conf->num_savednics is 0 and conf->saved_nics is NULL,
 * so calling it again or freeing conf->saved_nics later is harmless.
 */
void lxc_rename_phys_nics_on_shutdown(struct lxc_conf *conf)
{
	int i;

	INFO("running to reset %d nic names", conf->num_savednics);
	for (i = 0; i < conf->num_savednics; i++) {
		struct saved_nic *s = &conf->saved_nics[i];

		INFO("resetting nic %d to %s", s->ifindex, s->orig_name);
		lxc_netdev_rename_by_index(s->ifindex, s->orig_name);
		free(s->orig_name);
	}
	conf->num_savednics = 0;
	free(conf->saved_nics);
	/* do not leave a dangling pointer behind */
	conf->saved_nics = NULL;
}
/* mount point used for the rootfs of a freshly initialised configuration */
static char *default_rootfs_mount = LXCROOTFSMOUNT;

/*
 * Allocate a container configuration and fill in its defaults.
 *
 * The structure is zero-filled first; file descriptors are then set to
 * -1, lists are initialised empty and rootfs.mount gets a private copy of
 * the default mount point.
 *
 * Returns the new configuration, or NULL on allocation failure.
 */
struct lxc_conf *lxc_conf_init(void)
{
	struct lxc_conf *conf;
	int i;

	conf = calloc(1, sizeof(*conf));
	if (conf == NULL) {
		ERROR("lxc_conf_init : %m");
		return NULL;
	}

	conf->loglevel = LXC_LOG_PRIORITY_NOTSET;
	conf->personality = -1;
	conf->autodev = -1;
	conf->kmsg = 1;
	conf->maincmd_fd = -1;

	/* console: no paths, no open descriptors */
	conf->console.log_path = NULL;
	conf->console.path = NULL;
	conf->console.name[0] = '\0';
	conf->console.log_fd = -1;
	conf->console.peer = -1;
	conf->console.master = -1;
	conf->console.slave = -1;
	conf->console.peerpty.busy = -1;
	conf->console.peerpty.master = -1;
	conf->console.peerpty.slave = -1;

	conf->rootfs.mount = strdup(default_rootfs_mount);
	if (conf->rootfs.mount == NULL) {
		ERROR("lxc_conf_init : %m");
		free(conf);
		return NULL;
	}

	lxc_list_init(&conf->cgroup);
	lxc_list_init(&conf->network);
	lxc_list_init(&conf->mount_list);
	lxc_list_init(&conf->caps);
	lxc_list_init(&conf->keepcaps);
	lxc_list_init(&conf->id_map);
	for (i = 0; i < NUM_LXC_HOOKS; i++)
		lxc_list_init(&conf->hooks[i]);
	lxc_list_init(&conf->groups);

	conf->lsm_aa_profile = NULL;
	conf->lsm_se_context = NULL;
	conf->lsm_umount_proc = 0;

	for (i = 0; i < LXC_NS_MAX; i++)
		conf->inherit_ns_fd[i] = -1;

	return conf;
}
/*
 * Create a veth pair for the container.
 *
 * The host side is named after netdev->priv.veth_attr.pair when set,
 * otherwise a temporary name is generated and remembered in
 * netdev->priv.veth_attr.veth1 for shutdown.  The host side gets a
 * private hw address, the optional MTU, is attached to netdev->link when
 * set and is brought up; the peer's index is stored in netdev->ifindex.
 * The optional up script is run last.
 *
 * Temporary names returned by lxc_mkifname() are released on every path.
 *
 * Returns 0 on success, -1 on failure.
 */
static int instanciate_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	char veth1buf[IFNAMSIZ], *veth1;
	char veth2buf[IFNAMSIZ], *veth2 = NULL;
	int err;

	if (netdev->priv.veth_attr.pair)
		veth1 = netdev->priv.veth_attr.pair;
	else {
		err = snprintf(veth1buf, sizeof(veth1buf), "vethXXXXXX");
		if (err >= sizeof(veth1buf)) { /* can't *really* happen, but... */
			ERROR("veth1 name too long");
			return -1;
		}
		veth1 = lxc_mkifname(veth1buf);
		if (!veth1) {
			ERROR("failed to allocate a temporary name");
			return -1;
		}
		/* store away for deconf; copy as a string, the returned
		 * buffer is not known to be IFNAMSIZ bytes long */
		snprintf(netdev->priv.veth_attr.veth1, IFNAMSIZ, "%s", veth1);
	}

	snprintf(veth2buf, sizeof(veth2buf), "vethXXXXXX");
	veth2 = lxc_mkifname(veth2buf);
	if (!veth2) {
		ERROR("failed to allocate a temporary name");
		/* nothing has been created yet, so there is nothing to delete */
		goto out_free;
	}

	err = lxc_veth_create(veth1, veth2);
	if (err) {
		ERROR("failed to create %s-%s : %s", veth1, veth2,
		      strerror(-err));
		goto out_delete;
	}

	/* changing the high byte of the mac address to 0xfe, the bridge interface
	 * will always keep the host's mac address and not take the mac address
	 * of a container */
	err = setup_private_host_hw_addr(veth1);
	if (err) {
		ERROR("failed to change mac address of host interface '%s' : %s",
			veth1, strerror(-err));
		goto out_delete;
	}

	if (netdev->mtu) {
		err = lxc_netdev_set_mtu(veth1, atoi(netdev->mtu));
		if (!err)
			err = lxc_netdev_set_mtu(veth2, atoi(netdev->mtu));
		if (err) {
			ERROR("failed to set mtu '%s' for %s-%s : %s",
			      netdev->mtu, veth1, veth2, strerror(-err));
			goto out_delete;
		}
	}

	if (netdev->link) {
		err = lxc_bridge_attach(netdev->link, veth1);
		if (err) {
			ERROR("failed to attach '%s' to the bridge '%s' : %s",
				      veth1, netdev->link, strerror(-err));
			goto out_delete;
		}
	}

	netdev->ifindex = if_nametoindex(veth2);
	if (!netdev->ifindex) {
		ERROR("failed to retrieve the index for %s", veth2);
		goto out_delete;
	}

	err = lxc_netdev_up(veth1);
	if (err) {
		ERROR("failed to set %s up : %s", veth1, strerror(-err));
		goto out_delete;
	}

	if (netdev->upscript) {
		err = run_script(handler->name, "net", netdev->upscript, "up",
				 "veth", veth1, (char*) NULL);
		if (err)
			goto out_delete;
	}

	DEBUG("instanciated veth '%s/%s', index is '%d'",
	      veth1, veth2, netdev->ifindex);

	/* the names are no longer needed: veth1 was copied into veth_attr */
	if (!netdev->priv.veth_attr.pair)
		free(veth1);
	free(veth2);
	return 0;

out_delete:
	lxc_netdev_delete_by_name(veth1);
out_free:
	if (!netdev->priv.veth_attr.pair)
		free(veth1);
	free(veth2);
	return -1;
}
/*
 * Run the down script (if any) for a veth device, passing the host side
 * interface name: the configured pair name when set, otherwise the name
 * stored in veth_attr.veth1.
 * Returns 0 on success, -1 if the script fails.
 */
static int shutdown_veth(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	char *host_if;

	if (!netdev->downscript)
		return 0;

	host_if = netdev->priv.veth_attr.pair ? netdev->priv.veth_attr.pair
					      : netdev->priv.veth_attr.veth1;

	if (run_script(handler->name, "net", netdev->downscript,
		       "down", "veth", host_if, (char*) NULL))
		return -1;

	return 0;
}
/*
 * Create a macvlan interface on top of netdev->link.
 *
 * A temporary interface name is generated, the macvlan is created with
 * the configured mode and its index is stored in netdev->ifindex.  The
 * optional up script is run last.  The temporary name is released on
 * every path.
 *
 * Returns 0 on success, -1 on failure.
 */
static int instanciate_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	char peerbuf[IFNAMSIZ], *peer;
	int err;

	if (!netdev->link) {
		ERROR("no link specified for macvlan netdev");
		return -1;
	}

	err = snprintf(peerbuf, sizeof(peerbuf), "mcXXXXXX");
	if (err >= sizeof(peerbuf))
		return -1;

	peer = lxc_mkifname(peerbuf);
	if (!peer) {
		ERROR("failed to make a temporary name");
		return -1;
	}

	err = lxc_macvlan_create(netdev->link, peer,
				 netdev->priv.macvlan_attr.mode);
	if (err) {
		ERROR("failed to create macvlan interface '%s' on '%s' : %s",
		      peer, netdev->link, strerror(-err));
		goto out;
	}

	netdev->ifindex = if_nametoindex(peer);
	if (!netdev->ifindex) {
		ERROR("failed to retrieve the index for %s", peer);
		goto out;
	}

	if (netdev->upscript) {
		err = run_script(handler->name, "net", netdev->upscript, "up",
				 "macvlan", netdev->link, (char*) NULL);
		if (err)
			goto out;
	}

	DEBUG("instanciated macvlan '%s', index is '%d' and mode '%d'",
	      peer, netdev->ifindex, netdev->priv.macvlan_attr.mode);

	/* only the index is kept, the generated name can go */
	free(peer);
	return 0;

out:
	lxc_netdev_delete_by_name(peer);
	free(peer);
	return -1;
}
/*
 * Run the down script (if any) for a macvlan device, passing the link
 * name.
 * Returns 0 on success, -1 if the script fails.
 */
static int shutdown_macvlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	if (!netdev->downscript)
		return 0;

	if (run_script(handler->name, "net", netdev->downscript,
		       "down", "macvlan", netdev->link,
		       (char*) NULL))
		return -1;

	return 0;
}
/* XXX: merge with instanciate_macvlan */
/*
 * Create a vlan interface named "vlan<vid>" on top of netdev->link and
 * store its index in netdev->ifindex.
 * Returns 0 on success, -1 on failure.
 */
static int instanciate_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	char peer[IFNAMSIZ];
	int err;

	if (!netdev->link) {
		ERROR("no link specified for vlan netdev");
		return -1;
	}

	err = snprintf(peer, sizeof(peer), "vlan%d", netdev->priv.vlan_attr.vid);
	if (err >= sizeof(peer)) {
		ERROR("peer name too long");
		return -1;
	}

	err = lxc_vlan_create(netdev->link, peer, netdev->priv.vlan_attr.vid);
	if (err) {
		ERROR("failed to create vlan interface '%s' on '%s' : %s",
		      peer, netdev->link, strerror(-err));
		return -1;
	}

	netdev->ifindex = if_nametoindex(peer);
	if (!netdev->ifindex) {
		ERROR("failed to retrieve the ifindex for %s", peer);
		lxc_netdev_delete_by_name(peer);
		return -1;
	}

	/* log the name that was actually created, not a placeholder */
	DEBUG("instanciated vlan '%s', ifindex is '%d'", peer,
	      netdev->ifindex);

	return 0;
}
/* Nothing to undo for a vlan device; always returns 0. */
static int shutdown_vlan(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;
	(void)netdev;

	return 0;
}
/*
 * Prepare a physical interface: look up the index of netdev->link, store
 * it in netdev->ifindex and run the optional up script.
 * Returns 0 on success, -1 on failure.
 */
static int instanciate_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	if (netdev->link == NULL) {
		ERROR("no link specified for the physical interface");
		return -1;
	}

	netdev->ifindex = if_nametoindex(netdev->link);
	if (netdev->ifindex == 0) {
		ERROR("failed to retrieve the index for %s", netdev->link);
		return -1;
	}

	if (netdev->upscript &&
	    run_script(handler->name, "net", netdev->upscript,
		       "up", "phys", netdev->link, (char*) NULL))
		return -1;

	return 0;
}
/*
 * Run the down script (if any) for a physical device, passing the link
 * name.
 * Returns 0 on success, -1 if the script fails.
 */
static int shutdown_phys(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	if (!netdev->downscript)
		return 0;

	if (run_script(handler->name, "net", netdev->downscript,
		       "down", "phys", netdev->link, (char*) NULL))
		return -1;

	return 0;
}
/* A "none" netdev has no interface: clear the index and return 0. */
static int instanciate_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void)handler;

	netdev->ifindex = 0;

	return 0;
}
/*
 * Setup handler for the "empty" type: clears netdev->ifindex and runs the
 * up script if one is configured.
 *
 * Returns 0 on success, -1 if run_script() returns non-zero.
 */
static int instanciate_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	netdev->ifindex = 0;

	if (!netdev->upscript)
		return 0;

	if (run_script(handler->name, "net", netdev->upscript,
		       "up", "empty", (char *) NULL) != 0)
		return -1;

	return 0;
}
/*
 * Shutdown handler for the "empty" type: runs the configured down script,
 * if there is one.
 *
 * Returns 0 when no script is configured or run_script() returns 0,
 * -1 otherwise.
 */
static int shutdown_empty(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	if (!netdev->downscript)
		return 0;

	if (run_script(handler->name, "net", netdev->downscript,
		       "down", "empty", (char *) NULL) != 0)
		return -1;

	return 0;
}
/* Shutdown handler for the "none" type: performs no work and returns 0. */
static int shutdown_none(struct lxc_handler *handler, struct lxc_netdev *netdev)
{
	(void) handler;
	(void) netdev;

	return 0;
}
/*
 * Report whether the configuration consists solely of LXC_NET_NONE entries.
 *
 * Returns 1 if the network list is non-empty and every entry has type
 * LXC_NET_NONE, 0 otherwise (including when the list is empty).
 */
int lxc_requests_empty_network(struct lxc_handler *handler)
{
	struct lxc_list *nics = &handler->conf->network;
	struct lxc_list *cur;
	bool saw_none = false;

	lxc_list_for_each(cur, nics) {
		struct lxc_netdev *nic = cur->elem;

		/* Any entry of another type means the answer is "no". */
		if (nic->type != LXC_NET_NONE)
			return 0;
		saw_none = true;
	}

	return saw_none ? 1 : 0;
}
/*
 * Instantiate every configured netdev by dispatching on its type through
 * the netdev_conf table.
 *
 * When the real uid is not 0 nothing is done and 0 is returned.
 * Returns 0 on success, -1 on an out-of-range type or when a type handler
 * returns non-zero.
 */
int lxc_create_network(struct lxc_handler *handler)
{
	struct lxc_list *nics = &handler->conf->network;
	struct lxc_list *cur;

	if (getuid() != 0)
		return 0;

	lxc_list_for_each(cur, nics) {
		struct lxc_netdev *nic = cur->elem;

		if (nic->type < 0 || nic->type > LXC_NET_MAXCONFTYPE) {
			ERROR("invalid network configuration type '%d'",
			      nic->type);
			return -1;
		}

		if (netdev_conf[nic->type](handler, nic) != 0) {
			ERROR("failed to create netdev");
			return -1;
		}
	}

	return 0;
}
/*
 * Undo lxc_create_network() for every configured netdev.
 *
 * A physical interface with a non-zero ifindex is only renamed back to
 * netdev->link.  Every other netdev goes through its netdev_deconf handler
 * and, if it has a non-zero ifindex, is deleted by index.  Failures are
 * logged as warnings and do not stop the loop.
 */
void lxc_delete_network(struct lxc_handler *handler)
{
	struct lxc_list *nics = &handler->conf->network;
	struct lxc_list *cur;

	lxc_list_for_each(cur, nics) {
		struct lxc_netdev *nic = cur->elem;
		int has_index = (nic->ifindex != 0);

		if (has_index && nic->type == LXC_NET_PHYS) {
			if (lxc_netdev_rename_by_index(nic->ifindex, nic->link))
				WARN("failed to rename to the initial name the "
				     "netdev '%s'", nic->link);
			continue;
		}

		if (netdev_deconf[nic->type](handler, nic))
			WARN("failed to destroy netdev");

		/* Recent kernels remove the virtual interfaces when the
		 * network namespace is destroyed, but if the interface was
		 * never moved into the namespace it has to be destroyed here.
		 */
		if (has_index && lxc_netdev_delete_by_index(nic->ifindex))
			WARN("failed to remove interface '%s'", nic->name);
	}
}
#define LXC_USERNIC_PATH LIBEXECDIR "/lxc/lxc-user-nic"
/*
 * Have the lxc-user-nic helper set up a veth nic for the process @pid.
 *
 * Only LXC_NET_VETH is accepted.  A child is forked which execs
 *     lxc-user-nic <pid> veth <link> <name>
 * and the parent returns the result of wait_for_pid() on that child.
 * Returns -1 on an unsupported nic type or fork failure.
 */
static int unpriv_assign_nic(struct lxc_netdev *netdev, pid_t pid)
{
	pid_t child;
	char pidstr[20];

	if (netdev->type != LXC_NET_VETH) {
		ERROR("nic type %d not support for unprivileged use",
		      netdev->type);
		return -1;
	}

	child = fork();
	if (child < 0) {
		SYSERROR("fork");
		return -1;
	}

	if (child == 0) {
		/* Child: call lxc-user-nic pid type bridge */
		char *args[] = { LXC_USERNIC_PATH, pidstr, "veth",
				 netdev->link, netdev->name, NULL };

		snprintf(pidstr, 19, "%lu", (unsigned long) pid);
		pidstr[19] = '\0';
		execvp(args[0], args);
		SYSERROR("execvp lxc-user-nic");
		exit(1);
	}

	return wait_for_pid(child);
}
/*
 * Hand every configured netdev over to the process @pid.
 *
 * When the real uid is not 0, veth nics are delegated to
 * unpriv_assign_nic().  Otherwise netdevs with a non-zero ifindex are
 * moved with lxc_netdev_move_by_index(); entries with ifindex 0 are
 * skipped.
 *
 * Returns 0 on success, -1 on the first failure.
 */
int lxc_assign_network(struct lxc_list *network, pid_t pid)
{
	struct lxc_list *cur;
	int unprivileged = (getuid() != 0);

	lxc_list_for_each(cur, network) {
		struct lxc_netdev *nic = cur->elem;
		int rc;

		if (unprivileged && nic->type == LXC_NET_VETH) {
			if (unpriv_assign_nic(nic, pid))
				return -1;
			// TODO fill in netdev->ifindex and name
			continue;
		}

		/* empty network namespace, nothing to move */
		if (nic->ifindex == 0)
			continue;

		rc = lxc_netdev_move_by_index(nic->ifindex, pid);
		if (rc) {
			ERROR("failed to move '%s' to the container : %s",
			      nic->link, strerror(-rc));
			return -1;
		}

		DEBUG("move '%s' to '%d'", nic->name, pid);
	}

	return 0;
}
/*
 * Write @buf_size bytes from @buf to /proc/<pid>/uid_map (for ID_TYPE_UID)
 * or /proc/<pid>/gid_map (for any other @idtype).
 *
 * Returns 0 on success, -E2BIG if the path does not fit in PATH_MAX,
 * -EINVAL if the file cannot be opened, -1 if the write fails, or the
 * non-zero fclose() result if closing the stream fails.
 */
static int write_id_mapping(enum idtype idtype, pid_t pid, const char *buf,
			    size_t buf_size)
{
	char path[PATH_MAX];
	int ret, closeret;
	FILE *f;

	ret = snprintf(path, PATH_MAX, "/proc/%d/%cid_map", pid, idtype == ID_TYPE_UID ? 'u' : 'g');
	if (ret < 0 || ret >= PATH_MAX) {
		fprintf(stderr, "%s: path name too long", __func__);
		return -E2BIG;
	}

	f = fopen(path, "w");
	if (!f) {
		perror("open");
		return -EINVAL;
	}

	/*
	 * fwrite() returns the number of items written (a size_t), never a
	 * negative value, so failure has to be detected by comparing against
	 * the requested item count.  An empty buffer is not an error.
	 */
	ret = 0;
	if (buf_size > 0 && fwrite(buf, buf_size, 1, f) != 1) {
		SYSERROR("writing id mapping");
		ret = -1;
	}

	/* The data may only reach the file on close, so check that too. */
	closeret = fclose(f);
	if (closeret)
		SYSERROR("writing id mapping");

	return ret < 0 ? ret : closeret;
}
/*
 * Install the uid and gid mappings from @idmap for the process @pid.
 *
 * For each id type that has at least one entry in @idmap:
 *  - when the real uid is 0, the entries are written one per line to
 *    /proc/<pid>/[ug]id_map via write_id_mapping();
 *  - otherwise a "new[ug]idmap <pid> <nsid> <hostid> <range> ..." command
 *    line is built and run through system().
 *
 * Returns 0 on success, -ENOMEM if the work buffer cannot be allocated,
 * -E2BIG if the mappings do not fit in the 4k buffer, or the non-zero
 * result of write_id_mapping()/system().
 */
int lxc_map_ids(struct lxc_list *idmap, pid_t pid)
{
	/* The kernel only takes <= 4k for writes to /proc/<nr>/[ug]id_map */
	enum { IDMAP_BUF_SIZE = 4096 };
	struct lxc_list *iterator;
	struct id_map *map;
	int ret = 0;
	enum idtype type;
	char *buf, *pos;
	int am_root = (getuid() == 0);

	buf = malloc(IDMAP_BUF_SIZE);
	if (!buf)
		return -ENOMEM;

	for (type = ID_TYPE_UID; type <= ID_TYPE_GID; type++) {
		int left, fill;
		int had_entry = 0;

		pos = buf;
		if (!am_root) {
			fill = snprintf(buf, IDMAP_BUF_SIZE, "new%cidmap %d",
					type == ID_TYPE_UID ? 'u' : 'g',
					pid);
			if (fill < 0 || fill >= IDMAP_BUF_SIZE) {
				ret = -E2BIG;
				goto out;
			}
			pos += fill;
		}

		lxc_list_for_each(iterator, idmap) {
			map = iterator->elem;
			if (map->idtype != type)
				continue;

			had_entry = 1;
			left = IDMAP_BUF_SIZE - (pos - buf);
			fill = snprintf(pos, left, "%s%lu %lu %lu%s",
					am_root ? "" : " ",
					map->nsid, map->hostid, map->range,
					am_root ? "\n" : "");
			if (fill <= 0 || fill >= left) {
				/*
				 * Stop here: advancing pos by a truncated
				 * length would run past the end of buf.
				 */
				SYSERROR("snprintf failed, too many mappings");
				ret = -E2BIG;
				goto out;
			}
			pos += fill;
		}

		if (!had_entry)
			continue;

		if (am_root) {
			ret = write_id_mapping(type, pid, buf, pos - buf);
		} else {
			left = IDMAP_BUF_SIZE - (pos - buf);
			fill = snprintf(pos, left, "\n");
			if (fill <= 0 || fill >= left) {
				SYSERROR("snprintf failed, too many mappings");
				ret = -E2BIG;
				goto out;
			}
			pos += fill;
			ret = system(buf);
		}

		if (ret)
			break;
	}

out:
	free(buf);
	return ret;
}
/*
 * Look up the host id to which the container's id 0 is mapped for the
 * requested @idtype and store it in *val.
 *
 * Only the first entry of conf->id_map whose type matches @idtype and
 * whose nsid is 0 is considered.
 *
 * Return true if such an entry was found, false otherwise (*val is left
 * untouched in that case).
 */
bool get_mapped_rootid(struct lxc_conf *conf, enum idtype idtype,
		       unsigned long *val)
{
	struct lxc_list *it;
	struct id_map *map;

	lxc_list_for_each(it, &conf->id_map) {
		map = it->elem;
		/* Honour the caller's id type instead of assuming uids. */
		if (map->idtype != idtype)
			continue;
		if (map->nsid != 0)
			continue;
		*val = map->hostid;
		return true;
	}

	return false;
}
/*
 * Translate host id @id into the corresponding namespace id using the
 * first entry of conf->id_map of type @idtype whose host range
 * [hostid, hostid + range) contains @id.
 *
 * Returns the namespace id, or -1 if no entry covers @id.
 */
int mapped_hostid(unsigned id, struct lxc_conf *conf, enum idtype idtype)
{
	struct lxc_list *cur;

	lxc_list_for_each(cur, &conf->id_map) {
		struct id_map *m = cur->elem;

		if (m->idtype == idtype &&
		    id >= m->hostid && id < m->hostid + m->range)
			return (id - m->hostid) + m->nsid;
	}

	return -1;
}
/*
 * Find a namespace id of type @idtype that no entry of conf->id_map covers.
 *
 * Starts at 0; whenever the candidate falls inside an entry's namespace
 * range [nsid, nsid + range) it is moved to the end of that range and the
 * whole list is scanned again.  Returns the first candidate that survives
 * a full scan.
 */
int find_unmapped_nsuid(struct lxc_conf *conf, enum idtype idtype)
{
	struct lxc_list *cur;
	unsigned int candidate = 0;
	bool moved;

	do {
		moved = false;

		lxc_list_for_each(cur, &conf->id_map) {
			struct id_map *m = cur->elem;

			if (m->idtype != idtype)
				continue;

			if (candidate >= m->nsid && candidate < m->nsid + m->range) {
				candidate = m->nsid + m->range;
				moved = true;
				break;	/* restart the scan from the head */
			}
		}
	} while (moved);

	return candidate;
}
/*
 * For every netdev that has ipv4_gateway_auto and/or ipv6_gateway_auto
 * set, fetch the matching address of its link interface into
 * netdev->ipv4_gateway / netdev->ipv6_gateway.
 *
 * Returns 0 on success; -1 if the netdev is neither veth nor macvlan, has
 * no link, or an address lookup fails; -EINVAL if the link name cannot be
 * resolved to an interface index.
 */
int lxc_find_gateway_addresses(struct lxc_handler *handler)
{
	struct lxc_list *nics = &handler->conf->network;
	struct lxc_list *cur;

	lxc_list_for_each(cur, nics) {
		struct lxc_netdev *nic = cur->elem;
		int link_index;

		if (!nic->ipv4_gateway_auto && !nic->ipv6_gateway_auto)
			continue;

		if (nic->type != LXC_NET_VETH && nic->type != LXC_NET_MACVLAN) {
			ERROR("gateway = auto only supported for "
			      "veth and macvlan");
			return -1;
		}

		if (!nic->link) {
			ERROR("gateway = auto needs a link interface");
			return -1;
		}

		link_index = if_nametoindex(nic->link);
		if (link_index == 0)
			return -EINVAL;

		if (nic->ipv4_gateway_auto &&
		    lxc_ipv4_addr_get(link_index, &nic->ipv4_gateway)) {
			ERROR("failed to automatically find ipv4 gateway "
			      "address from link interface '%s'", nic->link);
			return -1;
		}

		if (nic->ipv6_gateway_auto &&
		    lxc_ipv6_addr_get(link_index, &nic->ipv6_gateway)) {
			ERROR("failed to automatically find ipv6 gateway "
			      "address from link interface '%s'", nic->link);
			return -1;
		}
	}

	return 0;
}
/*
 * Allocate conf->tty pseudo terminals and record them in conf->tty_info.
 *
 * Does nothing and returns 0 when conf->tty is 0.  Each pty is opened
 * with openpty() under process_lock(), both descriptors are marked
 * close-on-exec and the entry is flagged as not busy.
 *
 * Returns 0 on success, -1 on allocation or openpty() failure; on an
 * openpty() failure the ptys opened so far are released through
 * lxc_delete_tty().
 */
int lxc_create_tty(const char *name, struct lxc_conf *conf)
{
	struct lxc_tty_info *ttys = &conf->tty_info;
	int idx;

	/* no tty in the configuration */
	if (!conf->tty)
		return 0;

	ttys->pty_info = malloc(sizeof(*ttys->pty_info) * conf->tty);
	if (!ttys->pty_info) {
		SYSERROR("failed to allocate pty_info");
		return -1;
	}

	for (idx = 0; idx < conf->tty; idx++) {
		struct lxc_pty_info *pty = &ttys->pty_info[idx];
		int rc;

		process_lock();
		rc = openpty(&pty->master, &pty->slave, pty->name, NULL, NULL);
		process_unlock();

		if (rc) {
			SYSERROR("failed to create pty #%d", idx);
			/* Only the first idx entries hold open descriptors. */
			ttys->nbtty = idx;
			lxc_delete_tty(ttys);
			return -1;
		}

		DEBUG("allocated pty '%s' (%d/%d)",
		      pty->name, pty->master, pty->slave);

		/* Prevent leaking the file descriptors to the container */
		fcntl(pty->master, F_SETFD, FD_CLOEXEC);
		fcntl(pty->slave, F_SETFD, FD_CLOEXEC);

		pty->busy = 0;
	}

	ttys->nbtty = conf->tty;

	INFO("tty's configured");

	return 0;
}
/*
 * Close the master and slave descriptors of the first tty_info->nbtty
 * ptys and release the pty_info array.
 *
 * The pointer and the count are both reset afterwards, so calling this
 * function again on the same tty_info does not free the array twice.
 */
void lxc_delete_tty(struct lxc_tty_info *tty_info)
{
	int i;

	for (i = 0; i < tty_info->nbtty; i++) {
		struct lxc_pty_info *pty_info = &tty_info->pty_info[i];

		close(pty_info->master);
		close(pty_info->slave);
	}

	free(tty_info->pty_info);
	/* Do not leave a dangling pointer behind for a later call. */
	tty_info->pty_info = NULL;
	tty_info->nbtty = 0;
}
/*
 * chown_mapped_root: chown @path to the host uid that container root is
 * mapped to.
 *
 * When the effective uid is 0 this is a plain chown().  Otherwise an
 * unprivileged user with uid X cannot chown to a subuid Y directly, so a
 * child runs "chown 0 <path>" through lxc-usernsexec with these mappings:
 *   u:0:<rootid>:1          namespace root is the mapped container root
 *   u:<hostuid>:<hostuid>:1 the caller's own uid is mapped to itself
 *   g:0:<hostgid>:1         namespace gid 0 is the caller's gid
 * That way the namespace root is privileged with respect to hostuid X,
 * allowing it to do the chown.
 *
 * Returns 0 on success, -1 if there is no mapping for container root or
 * chown()/fork() fails, otherwise the result of wait_for_pid() on the
 * child.
 */
int chown_mapped_root(char *path, struct lxc_conf *conf)
{
	uid_t rootid;
	pid_t pid;
	unsigned long val;

	if (!get_mapped_rootid(conf, ID_TYPE_UID, &val)) {
		ERROR("No mapping for container root");
		return -1;
	}
	rootid = (uid_t) val;

	if (geteuid() == 0) {
		if (chown(path, rootid, -1) < 0) {
			ERROR("Error chowning %s", path);
			return -1;
		}
		return 0;
	}

	pid = fork();
	if (pid < 0) {
		SYSERROR("Failed forking");
		return -1;
	}

	if (!pid) {
		/*
		 * Child.  It must never return into the caller's code path:
		 * every failure below terminates the child, and the parent
		 * sees it through wait_for_pid().
		 */
		uid_t hostuid = geteuid();
		int ret;
		char map1[100], map2[100], map3[100];
		char *args[] = {"lxc-usernsexec", "-m", map1, "-m", map2, "-m",
				map3, "--", "chown", "0", path, NULL};

		// "u:0:rootid:1"
		ret = snprintf(map1, sizeof(map1), "u:0:%u:1", (unsigned int) rootid);
		if (ret < 0 || (size_t) ret >= sizeof(map1)) {
			ERROR("Error uid printing map string");
			exit(1);
		}

		// "u:hostuid:hostuid:1"
		ret = snprintf(map2, sizeof(map2), "u:%u:%u:1",
			       (unsigned int) hostuid, (unsigned int) hostuid);
		if (ret < 0 || (size_t) ret >= sizeof(map2)) {
			ERROR("Error uid printing map string");
			exit(1);
		}

		// "g:0:hostgid:1"
		ret = snprintf(map3, sizeof(map3), "g:0:%u:1", (unsigned int) getgid());
		if (ret < 0 || (size_t) ret >= sizeof(map3)) {
			ERROR("Error uid printing map string");
			exit(1);
		}

		execvp("lxc-usernsexec", args);
		SYSERROR("Failed executing usernsexec");
		exit(1);
	}

	return wait_for_pid(pid);
}
int ttys_shift_ids(struct lxc_conf *c)
{
int i;
if (lxc_list_empty(&c->id_map))
return 0;
for (i = 0; i < c->tty_info.nbtty; i++) {
struct lxc_pty_info *pty_info = &c->tty_info.pty_info[i];
if (chown_mapped_root(pty_info->name, c) < 0) {
ERROR("Failed to chown %s", pty_info->name);
return -1;
}
}
if (strcmp(c->console.name, "") !=0 && chown_mapped_root(c->console.name, c) < 0) {
ERROR("Failed to chown %s", c->console.name);
return -1;
}
return 0;
}
/*
 * This routine is called when the configuration does not already specify a value
 * for autodev (mounting a file system on /dev and populating it in a container).
 * If a hard override value has not been specified, then we try to apply some
 * heuristics to determine if we should switch to autodev mode.
 *
 * For instance, if the container has an /etc/systemd/system directory then it
 * is probably running systemd as the init process and it needs the autodev
 * mount to prevent it from mounting devtmpfs on /dev on its own, causing conflicts
 * in the host.
 *
 * We may also want to enable autodev if the host has devtmpfs mounted on its
 * /dev as this then enables us to use subdirectories under /dev for the container
 * /dev directories and we can fake udev devices.
 */
/*
 * Argument vector of the command to start.  check_autodev() casts its
 * opaque data pointer to this type and reads argv[0] as the init command.
 */
struct start_args {
char *const *argv;
};
#define MAX_SYMLINK_DEPTH 32
/*
 * Heuristically decide whether autodev should be enabled for @rootfs.
 *
 * The init command is "/sbin/init", or argv[0] of the struct start_args
 * passed in @data when present.  Starting from that command, symlinks are
 * followed (re-rooted under the resolved @rootfs before each readlink())
 * for at most MAX_SYMLINK_DEPTH hops; if any path in the chain contains
 * "systemd", autodev is wanted.
 *
 * Returns 1 if autodev should be enabled, 0 if it is not required, and -2
 * if that cannot be determined (no rootfs, realpath() failure, command
 * not accessible, or a path that does not fit in MAXPATHLEN).
 */
static int check_autodev( const char *rootfs, void *data )
{
	struct start_args *arg = data;
	int ret;
	int loop_count = 0;
	struct stat s;
	char absrootfs[MAXPATHLEN];
	char path[MAXPATHLEN];
	char abs_path[MAXPATHLEN];
	char *command = "/sbin/init";

	if (rootfs == NULL || strlen(rootfs) == 0)
		return -2;

	if (!realpath(rootfs, absrootfs))
		return -2;

	if (arg && arg->argv[0]) {
		command = arg->argv[0];
		DEBUG("Set exec command to %s", command );
	}

	/*
	 * snprintf always NUL-terminates; strncpy would leave path
	 * unterminated when the command is MAXPATHLEN-1 bytes or longer.
	 */
	ret = snprintf(path, sizeof(path), "%s", command);
	if (ret < 0 || (size_t) ret >= sizeof(path))
		return -2;

	if (0 != access(path, F_OK) || 0 != stat(path, &s))
		return -2;

	/* Dereference down the symlink merry path testing as we go. */
	/* If anything references systemd in the path - set autodev! */
	/* Renormalize to the rootfs before each dereference */
	/* Relative symlinks should fall out in the wash even with .. */
	while (1) {
		if (strstr(path, "systemd")) {
			INFO("Container with systemd init detected - enabling autodev!");
			return 1;
		}

		ret = snprintf(abs_path, sizeof(abs_path), "%s/%s", absrootfs, path);
		/* A result of sizeof(abs_path) or more means truncation. */
		if (ret < 0 || (size_t) ret >= sizeof(abs_path))
			return -2;

		ret = readlink(abs_path, path, MAXPATHLEN-1);
		if ((ret <= 0) || (++loop_count > MAX_SYMLINK_DEPTH))
			break; /* Break out for other tests */

		path[ret] = '\0';
	}

	/*
	 * Add future checks here.
	 *   Return positive if we should go autodev
	 *   Return 0 if we should NOT go autodev
	 *   Return negative if we encounter an error or can not determine...
	 */

	/* All else fails, we don't need autodev */
	INFO("Autodev not required.");
	return 0;
}
/*
 * Run the container setup steps for @handler, in this fixed order:
 * utsname (unless a UTS namespace fd is inherited), network, pre-mount
 * hooks, rootfs, optional autodev mount, automatic mounts (non-cgroup),
 * fstab mounts, mount entries, cgroup automatic mounts, mount hooks,
 * autodev hooks and population, console, kmsg, ttys, LSM /proc mount,
 * pivot root, pts, personality and finally capabilities.
 *
 * Returns 0 on success and -1 as soon as a step fails.  The one exception
 * is setup_kmsg(), whose failure is only logged.
 */
int lxc_setup(struct lxc_handler *handler)
{
const char *name = handler->name;
struct lxc_conf *lxc_conf = handler->conf;
const char *lxcpath = handler->lxcpath;
void *data = handler->data;
/* Only set the utsname when no UTS namespace fd is inherited (-1). */
if (lxc_conf->inherit_ns_fd[LXC_NS_UTS] == -1) {
if (setup_utsname(lxc_conf->utsname)) {
ERROR("failed to setup the utsname for '%s'", name);
return -1;
}
}
if (setup_network(&lxc_conf->network)) {
ERROR("failed to setup the network for '%s'", name);
return -1;
}
if (run_lxc_hooks(name, "pre-mount", lxc_conf, lxcpath, NULL)) {
ERROR("failed to run pre-mount hooks for container '%s'.", name);
return -1;
}
if (setup_rootfs(lxc_conf)) {
ERROR("failed to setup rootfs for '%s'", name);
return -1;
}
/* A negative autodev value means "not configured": ask the heuristic. */
if (lxc_conf->autodev < 0) {
lxc_conf->autodev = check_autodev(lxc_conf->rootfs.mount, data);
}
if (lxc_conf->autodev > 0) {
if (mount_autodev(name, lxc_conf->rootfs.mount, lxcpath)) {
ERROR("failed to mount /dev in the container");
return -1;
}
}
/* do automatic mounts (mainly /proc and /sys), but exclude
* those that need to wait until other stuff has finished
*/
if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & ~LXC_AUTO_CGROUP_MASK, handler) < 0) {
ERROR("failed to setup the automatic mounts for '%s'", name);
return -1;
}
if (setup_mount(&lxc_conf->rootfs, lxc_conf->fstab, name)) {
ERROR("failed to setup the mounts for '%s'", name);
return -1;
}
if (!lxc_list_empty(&lxc_conf->mount_list) && setup_mount_entries(&lxc_conf->rootfs, &lxc_conf->mount_list, name)) {
ERROR("failed to setup the mount entries for '%s'", name);
return -1;
}
/* now mount only cgroup, if wanted;
* before, /sys could not have been mounted
* (is either mounted automatically or via fstab entries)
*/
if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) {
ERROR("failed to setup the automatic mounts for '%s'", name);
return -1;
}
if (run_lxc_hooks(name, "mount", lxc_conf, lxcpath, NULL)) {
ERROR("failed to run mount hooks for container '%s'.", name);
return -1;
}
/* With autodev enabled, run the autodev hooks before populating /dev. */
if (lxc_conf->autodev > 0) {
if (run_lxc_hooks(name, "autodev", lxc_conf, lxcpath, NULL)) {
ERROR("failed to run autodev hooks for container '%s'.", name);
return -1;
}
if (setup_autodev(lxc_conf->rootfs.mount)) {
ERROR("failed to populate /dev in the container");
return -1;
}
}
/* Console and tty setup are skipped when is_execute is set. */
if (!lxc_conf->is_execute && setup_console(&lxc_conf->rootfs, &lxc_conf->console, lxc_conf->ttydir)) {
ERROR("failed to setup the console for '%s'", name);
return -1;
}
if (lxc_conf->kmsg) {
if (setup_kmsg(&lxc_conf->rootfs, &lxc_conf->console)) // don't fail
ERROR("failed to setup kmsg for '%s'", name);
}
if (!lxc_conf->is_execute && setup_tty(&lxc_conf->rootfs, &lxc_conf->tty_info, lxc_conf->ttydir)) {
ERROR("failed to setup the ttys for '%s'", name);
return -1;
}
/* mount /proc if needed for LSM transition */
if (lsm_proc_mount(lxc_conf) < 0) {
ERROR("failed to LSM mount proc for '%s'", name);
return -1;
}
if (setup_pivot_root(&lxc_conf->rootfs)) {
ERROR("failed to set rootfs for '%s'", name);
return -1;
}
if (setup_pts(lxc_conf->pts)) {
ERROR("failed to setup the new pts instance");
return -1;
}
if (setup_personality(lxc_conf->personality)) {
ERROR("failed to setup personality");
return -1;
}
/*
 * Capabilities are only adjusted when no id map is configured.  A keep
 * list and a drop list are mutually exclusive.
 */
if (lxc_list_empty(&lxc_conf->id_map)) {
if (!lxc_list_empty(&lxc_conf->keepcaps)) {
if (!lxc_list_empty(&lxc_conf->caps)) {
ERROR("Simultaneously requested dropping and keeping caps");
return -1;
}
if (dropcaps_except(&lxc_conf->keepcaps)) {
ERROR("failed to keep requested caps");
return -1;
}
} else if (setup_caps(&lxc_conf->caps)) {
ERROR("failed to drop capabilities");
return -1;
}
}
NOTICE("'%s' is setup.", name);
return 0;
}
/*
 * Run every script registered for the hook point named @hook.
 *
 * @hook must be one of "pre-start", "pre-mount", "mount", "autodev",
 * "start", "post-stop" or "clone".  Scripts are run in list order through
 * run_script_argv().
 *
 * Returns 0 if all scripts return 0, -1 for an unknown hook name, or the
 * non-zero result of the first failing script (later scripts are skipped).
 */
int run_lxc_hooks(const char *name, char *hook, struct lxc_conf *conf,
		  const char *lxcpath, char *argv[])
{
	const struct {
		const char *label;
		int slot;
	} known[] = {
		{ "pre-start", LXCHOOK_PRESTART },
		{ "pre-mount", LXCHOOK_PREMOUNT },
		{ "mount",     LXCHOOK_MOUNT },
		{ "autodev",   LXCHOOK_AUTODEV },
		{ "start",     LXCHOOK_START },
		{ "post-stop", LXCHOOK_POSTSTOP },
		{ "clone",     LXCHOOK_CLONE },
	};
	const size_t nknown = sizeof(known) / sizeof(known[0]);
	struct lxc_list *cur;
	size_t k;
	int which = -1;

	for (k = 0; k < nknown; k++) {
		if (strcmp(hook, known[k].label) == 0) {
			which = known[k].slot;
			break;
		}
	}
	if (k == nknown)
		return -1;

	lxc_list_for_each(cur, &conf->hooks[which]) {
		char *script = cur->elem;
		int rc = run_script_argv(name, "lxc", script, hook, lxcpath, argv);

		if (rc)
			return rc;
	}

	return 0;
}
/*
 * Unlink the list node @it from its list and free it together with the
 * netdev it carries: all owned strings (link, name, veth pair name, up
 * and down scripts, hwaddr, mtu, gateways) and every entry of the ipv4
 * and ipv6 address lists.
 */
static void lxc_remove_nic(struct lxc_list *it)
{
	struct lxc_netdev *netdev = it->elem;
	struct lxc_list *it2,*next;

	lxc_list_del(it);

	/* free(NULL) is a no-op, so no per-field guards are needed. */
	free(netdev->link);
	free(netdev->name);
	/* priv is only meaningful as veth_attr for veth netdevs. */
	if (netdev->type == LXC_NET_VETH)
		free(netdev->priv.veth_attr.pair);
	free(netdev->upscript);
	/* The down script was previously leaked. */
	free(netdev->downscript);
	free(netdev->hwaddr);
	free(netdev->mtu);
	free(netdev->ipv4_gateway);
	free(netdev->ipv6_gateway);

	lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
		lxc_list_del(it2);
		free(it2->elem);
		free(it2);
	}
	lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
		lxc_list_del(it2);
		free(it2->elem);
		free(it2);
	}

	free(netdev);
	free(it);
}
/*
 * Clear a whole nic or one of its settings.
 *
 * @key is something like '0', '0.ipv4' or '1.ipv6': a zero-based index
 * into c->network, optionally followed by ".<field>".  Without a field
 * (or with a trailing lone '.') the nic itself is removed.  Recognised
 * fields are ipv4, ipv6, link, name, script.up, hwaddr, mtu, ipv4_gateway
 * and ipv6_gateway.
 *
 * Returns 0 on success, -1 on a malformed key, an index with no nic
 * behind it, or an unknown field.
 */
int lxc_clear_nic(struct lxc_conf *c, const char *key)
{
	const char *field;
	int idx, pos;
	struct lxc_list *it;
	struct lxc_list *it2, *next;
	struct lxc_netdev *netdev;

	field = strchr(key, '.');
	if (field && field[1] == '\0')
		field = NULL;

	if (sscanf(key, "%d", &idx) != 1 || idx < 0)
		return -1;

	pos = 0;
	lxc_list_for_each(it, &c->network) {
		if (pos == idx)
			break;
		pos++;
	}
	if (pos < idx) // we don't have that many nics defined
		return -1;
	if (!it || !it->elem)
		return -1;

	netdev = it->elem;

	if (!field) {
		lxc_remove_nic(it);
		return 0;
	}

	if (strcmp(field, ".ipv4") == 0) {
		lxc_list_for_each_safe(it2, &netdev->ipv4, next) {
			lxc_list_del(it2);
			free(it2->elem);
			free(it2);
		}
		return 0;
	}

	if (strcmp(field, ".ipv6") == 0) {
		lxc_list_for_each_safe(it2, &netdev->ipv6, next) {
			lxc_list_del(it2);
			free(it2->elem);
			free(it2);
		}
		return 0;
	}

	/* The remaining fields are plain owned strings: free and reset. */
	if (strcmp(field, ".link") == 0) {
		free(netdev->link);
		netdev->link = NULL;
		return 0;
	}

	if (strcmp(field, ".name") == 0) {
		free(netdev->name);
		netdev->name = NULL;
		return 0;
	}

	if (strcmp(field, ".script.up") == 0) {
		free(netdev->upscript);
		netdev->upscript = NULL;
		return 0;
	}

	if (strcmp(field, ".hwaddr") == 0) {
		free(netdev->hwaddr);
		netdev->hwaddr = NULL;
		return 0;
	}

	if (strcmp(field, ".mtu") == 0) {
		free(netdev->mtu);
		netdev->mtu = NULL;
		return 0;
	}

	if (strcmp(field, ".ipv4_gateway") == 0) {
		free(netdev->ipv4_gateway);
		netdev->ipv4_gateway = NULL;
		return 0;
	}

	if (strcmp(field, ".ipv6_gateway") == 0) {
		free(netdev->ipv6_gateway);
		netdev->ipv6_gateway = NULL;
		return 0;
	}

	return -1;
}
int lxc_clear_config_network(struct lxc_conf *c)
{
struct lxc_list *it,*next;
lxc_list_for_each_safe(it, &c->network, next) {
lxc_remove_nic(it);
}
return 0;
}
/*
 * Empty c->caps, freeing each node and the element it holds.
 * Always returns 0.
 */
int lxc_clear_config_caps(struct lxc_conf *c)
{
	struct lxc_list *cur, *tmp;

	lxc_list_for_each_safe(cur, &c->caps, tmp) {
		void *payload = cur->elem;

		lxc_list_del(cur);
		free(payload);
		free(cur);
	}

	return 0;
}
/*
 * Empty the list @id_map, freeing each node and the element it holds.
 * The list head itself is not freed.  Always returns 0.
 */
static int lxc_free_idmap(struct lxc_list *id_map)
{
	struct lxc_list *cur, *tmp;

	lxc_list_for_each_safe(cur, id_map, tmp) {
		void *payload = cur->elem;

		lxc_list_del(cur);
		free(payload);
		free(cur);
	}

	return 0;
}
/* Drop every entry of c->id_map; returns the result of lxc_free_idmap(). */
int lxc_clear_idmaps(struct lxc_conf *c)
{
	struct lxc_list *maps = &c->id_map;

	return lxc_free_idmap(maps);
}
/*
 * Empty c->keepcaps, freeing each node and the element it holds.
 * Always returns 0.
 */
int lxc_clear_config_keepcaps(struct lxc_conf *c)
{
	struct lxc_list *cur, *tmp;

	lxc_list_for_each_safe(cur, &c->keepcaps, tmp) {
		void *payload = cur->elem;

		lxc_list_del(cur);
		free(payload);
		free(cur);
	}

	return 0;
}
/*
 * Remove cgroup settings from c->cgroup.
 *
 * When @key is exactly "lxc.cgroup" every entry is removed.  Otherwise
 * the text starting 11 characters into @key is compared against each
 * entry's subsystem and only matching entries are removed.
 * Always returns 0.
 */
int lxc_clear_cgroups(struct lxc_conf *c, const char *key)
{
	struct lxc_list *cur, *tmp;
	const char *subsys = key + 11;
	bool wipe_all = (strcmp(key, "lxc.cgroup") == 0);

	lxc_list_for_each_safe(cur, &c->cgroup, tmp) {
		struct lxc_cgroup *cg = cur->elem;

		if (wipe_all || strcmp(cg->subsystem, subsys) == 0) {
			lxc_list_del(cur);
			free(cg->subsystem);
			free(cg->value);
			free(cg);
			free(cur);
		}
	}

	return 0;
}
/*
 * Empty c->groups, freeing each node and the element it holds.
 * Always returns 0.
 */
int lxc_clear_groups(struct lxc_conf *c)
{
	struct lxc_list *cur, *tmp;

	lxc_list_for_each_safe(cur, &c->groups, tmp) {
		void *payload = cur->elem;

		lxc_list_del(cur);
		free(payload);
		free(cur);
	}

	return 0;
}
/*
 * Empty c->mount_list, freeing each node and the element it holds.
 * Always returns 0.
 */
int lxc_clear_mount_entries(struct lxc_conf *c)
{
	struct lxc_list *cur, *tmp;

	lxc_list_for_each_safe(cur, &c->mount_list, tmp) {
		void *payload = cur->elem;

		lxc_list_del(cur);
		free(payload);
		free(cur);
	}

	return 0;
}
/*
 * Remove hook scripts from c->hooks.
 *
 * When @key is exactly "lxc.hook" every hook list is emptied.  Otherwise
 * the text starting 9 characters into @key is compared against
 * lxchook_names[] and only the matching list is emptied.
 *
 * Returns 0 if at least one list was selected, -1 (after logging) if the
 * key names no known hook.
 */
int lxc_clear_hooks(struct lxc_conf *c, const char *key)
{
	struct lxc_list *cur, *tmp;
	const char *suffix = key + 9;
	bool wipe_all = (strcmp(key, "lxc.hook") == 0);
	bool matched = false;
	int h;

	for (h = 0; h < NUM_LXC_HOOKS; h++) {
		if (!wipe_all && strcmp(suffix, lxchook_names[h]) != 0)
			continue;

		lxc_list_for_each_safe(cur, &c->hooks[h], tmp) {
			lxc_list_del(cur);
			free(cur->elem);
			free(cur);
		}
		matched = true;
	}

	if (matched)
		return 0;

	ERROR("Invalid hook key: %s", key);
	return -1;
}
/*
 * Free the saved nic table of @conf: the orig_name of each of the
 * conf->num_savednics entries and then the conf->saved_nics array.
 * Afterwards the table is left empty (NULL pointer, zero count).
 *
 * Does nothing when conf->num_savednics is 0.
 */
static void lxc_clear_saved_nics(struct lxc_conf *conf)
{
	int i;

	if (!conf->num_savednics)
		return;

	for (i = 0; i < conf->num_savednics; i++)
		free(conf->saved_nics[i].orig_name);

	/*
	 * Free before clearing the pointer; the reverse order frees NULL
	 * and leaks the array.
	 */
	free(conf->saved_nics);
	conf->saved_nics = NULL;
	conf->num_savednics = 0;
}
/*
 * Release @conf and what it owns: its string members, the network list,
 * LSM settings, seccomp state, capability lists, cgroup settings, hooks,
 * mount entries, saved nics, id maps and groups.  A NULL @conf is
 * accepted and ignored.
 */
void lxc_conf_free(struct lxc_conf *conf)
{
	if (!conf)
		return;

	/* free(NULL) is a no-op, so unset members need no guard. */
	free(conf->console.path);
	free(conf->rootfs.mount);
	free(conf->rootfs.options);
	free(conf->rootfs.path);
	free(conf->rootfs.pivot);
	free(conf->logfile);
	free(conf->utsname);
	free(conf->ttydir);
	free(conf->fstab);
	free(conf->rcfile);

	lxc_clear_config_network(conf);

	free(conf->lsm_aa_profile);
	free(conf->lsm_se_context);

	lxc_seccomp_free(conf);
	lxc_clear_config_caps(conf);
	lxc_clear_config_keepcaps(conf);
	lxc_clear_cgroups(conf, "lxc.cgroup");
	lxc_clear_hooks(conf, "lxc.hook");
	lxc_clear_mount_entries(conf);
	lxc_clear_saved_nics(conf);
	lxc_clear_idmaps(conf);
	lxc_clear_groups(conf);

	free(conf);
}
/*
 * Argument bundle handed to run_userns_fn() through lxc_clone().
 */
struct userns_fn_data {
/* function to run in the child, called as fn(arg) */
int (*fn)(void *);
void *arg;
/* pipe: the child reads one byte from p[0] before calling fn; p[1] is the write end */
int p[2];
};
/*
 * Entry point of the cloned child: wait for a one-byte "go" signal on the
 * pipe, then run the requested function.
 *
 * @data points to a struct userns_fn_data.  Returns -1 if exactly one
 * byte cannot be read from the pipe, otherwise the result of fn(arg).
 */
static int run_userns_fn(void *data)
{
	struct userns_fn_data *ctx = data;
	char token;
	ssize_t got;

	// we're not sharing with the parent any more, if it was a thread
	close(ctx->p[1]);

	got = read(ctx->p[0], &token, 1);
	if (got != 1)
		return -1;

	close(ctx->p[0]);

	return ctx->fn(ctx->arg);
}
/*
 * Build a new id map list: a copy of conf->id_map plus, if @uid is not
 * already covered by a ID_TYPE_UID entry, a leading one-id entry mapping
 * an unused namespace uid to @uid.
 *
 * We may want to generalize this to do gids as well as uids, but right now
 * it's not necessary.
 *
 * Returns the newly allocated list head (the caller frees it with
 * lxc_free_idmap() followed by free()), or NULL on failure.
 */
static struct lxc_list *idmap_add_id(struct lxc_conf *conf, uid_t uid)
{
	struct lxc_list *head, *node, *cur, *tmp;
	struct id_map *copy;
	int nsid = mapped_hostid(uid, conf, ID_TYPE_UID);

	head = malloc(sizeof(*head));
	if (!head) {
		ERROR("Out of memory building id map");
		return NULL;
	}
	lxc_list_init(head);

	if (nsid < 0) {
		/* @uid is not mapped yet: give it a free namespace uid. */
		nsid = find_unmapped_nsuid(conf, ID_TYPE_UID);
		if (nsid < 0)
			goto err;

		node = malloc(sizeof(*node));
		if (!node)
			goto err;

		copy = malloc(sizeof(*copy));
		if (!copy) {
			free(node);
			goto err;
		}

		copy->idtype = ID_TYPE_UID;
		copy->nsid = nsid;
		copy->hostid = (unsigned long)uid;
		copy->range = 1;
		node->elem = copy;
		lxc_list_add_tail(head, node);
	}

	/* Duplicate every existing entry so the result owns its elements. */
	lxc_list_for_each_safe(cur, &conf->id_map, tmp) {
		node = malloc(sizeof(*node));
		if (!node)
			goto err;

		copy = malloc(sizeof(*copy));
		if (!copy) {
			free(node);
			goto err;
		}

		*copy = *(struct id_map *) cur->elem;
		node->elem = copy;
		lxc_list_add_tail(head, node);
	}

	return head;

err:
	ERROR("Out of memory building a new uid map");
	lxc_free_idmap(head);
	free(head);
	return NULL;
}
/*
 * Run a function in a new user namespace.
 * The caller's euid will be mapped in if it is not already.
 *
 * A child is cloned with CLONE_NEWUSER and blocks on a pipe until the
 * parent has installed the id mappings (conf->id_map plus the caller's
 * euid) and written one byte to the pipe; the child then runs fn(data).
 *
 * Returns the result of wait_for_pid() on the child, or -1 if the pipe,
 * the clone, the id mapping or the wake-up write fails.  When a failure
 * happens after the child was created, the child is reaped before
 * returning.
 */
int userns_exec_1(struct lxc_conf *conf, int (*fn)(void *), void *data)
{
	int ret, pid;
	struct userns_fn_data d;
	char c = '1';
	int p[2];
	struct lxc_list *idmap;

	ret = pipe(p);
	if (ret < 0) {
		SYSERROR("opening pipe");
		return -1;
	}

	d.fn = fn;
	d.arg = data;
	d.p[0] = p[0];
	d.p[1] = p[1];

	pid = lxc_clone(run_userns_fn, &d, CLONE_NEWUSER);
	if (pid < 0)
		goto err;

	/* The read end belongs to the child from here on. */
	close(p[0]);
	p[0] = -1;

	if ((idmap = idmap_add_id(conf, geteuid())) == NULL) {
		ERROR("Error adding self to container uid map");
		goto err;
	}

	ret = lxc_map_ids(idmap, pid);
	lxc_free_idmap(idmap);
	free(idmap);
	if (ret) {
		ERROR("Error setting up child mappings");
		goto err;
	}

	// kick the child
	if (write(p[1], &c, 1) != 1) {
		SYSERROR("writing to pipe to child");
		goto err;
	}

	ret = wait_for_pid(pid);

	close(p[1]);
	return ret;

err:
	if (p[0] != -1)
		close(p[0]);
	close(p[1]);
	/*
	 * With the write end closed the child's blocking read fails and
	 * run_userns_fn() returns; collect the child so that it is not left
	 * behind as a zombie.
	 */
	if (pid > 0)
		wait_for_pid(pid);
	return -1;
}