diff --git a/doc/lxc.container.conf.sgml.in b/doc/lxc.container.conf.sgml.in index 34b8117bb..27b518e03 100644 --- a/doc/lxc.container.conf.sgml.in +++ b/doc/lxc.container.conf.sgml.in @@ -1126,36 +1126,75 @@ dev/null proc/kcore none bind,relative 0 0 /sys as read-write + : - mount a tmpfs to /sys/fs/cgroup, - create directories for all hierarchies to which - the container is added, create subdirectories - there with the name of the cgroup, and bind-mount - the container's own cgroup into that directory. - The container will be able to write to its own - cgroup directory, but not the parents, since they - will be remounted read-only. + Mount a tmpfs to /sys/fs/cgroup, + create directories for all hierarchies to which the container + is added, create subdirectories in those hierarchies with the + name of the cgroup, and bind-mount the container's own cgroup + into that directory. The container will be able to write to + its own cgroup directory, but not the parents, since they will + be remounted read-only. + - : similar to - , but everything will - be mounted read-only. + : + The option will cause LXC to perform + the cgroup mounts for the container under all circumstances. + Otherwise it is similar to . + This is mainly useful when the cgroup namespaces are enabled + where LXC will normally leave mounting cgroups to the init + binary of the container since it is perfectly safe to do so. + + + + : + similar to , but everything will + be mounted read-only. + + + + + + : + The option will cause LXC to perform + the cgroup mounts for the container under all circumstances. + Otherwise it is similar to . + This is mainly useful when the cgroup namespaces are enabled + where LXC will normally leave mounting cgroups to the init + binary of the container since it is perfectly safe to do so. + + + : similar to - , but everything will - be mounted read-write. 
Note that the paths leading - up to the container's own cgroup will be writable, - but will not be a cgroup filesystem but just part - of the tmpfs of /sys/fs/cgroup + , but everything will be mounted + read-write. Note that the paths leading up to the container's + own cgroup will be writable, but will not be a cgroup + filesystem but just part of the tmpfs of + /sys/fs/cgroup + + + + : + The option will cause LXC to perform + the cgroup mounts for the container under all circumstances. + Otherwise it is similar to . + This is mainly useful when the cgroup namespaces are enabled + where LXC will normally leave mounting cgroups to the init + binary of the container since it is perfectly safe to do so. + + + (without specifier): diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index 826ea600b..c13f7fa2f 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -2043,44 +2043,49 @@ static int cg_mount_in_cgroup_namespace(int type, struct hierarchy *h, static bool cgfsng_mount(void *hdata, const char *root, int type) { - int i; + int i, ret; char *tmpfspath = NULL; bool retval = false; struct lxc_handler *handler = hdata; struct cgfsng_handler_data *d = handler->cgroup_data; - bool has_cgns = false, has_sys_admin = true; + bool has_cgns = false, wants_force_mount = false; if ((type & LXC_AUTO_CGROUP_MASK) == 0) return true; + if (type & LXC_AUTO_CGROUP_FORCE) { + type &= ~LXC_AUTO_CGROUP_FORCE; + wants_force_mount = true; + } + + if (!wants_force_mount){ + if (!lxc_list_empty(&handler->conf->keepcaps)) + wants_force_mount = !in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps); + else + wants_force_mount = in_caplist(CAP_SYS_ADMIN, &handler->conf->caps); + } + has_cgns = cgns_supported(); - if (!lxc_list_empty(&handler->conf->keepcaps)) - has_sys_admin = in_caplist(CAP_SYS_ADMIN, &handler->conf->keepcaps); - else - has_sys_admin = !in_caplist(CAP_SYS_ADMIN, &handler->conf->caps); - - if (has_cgns && has_sys_admin) + if (has_cgns && 
!wants_force_mount) return true; - tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL); - if (type == LXC_AUTO_CGROUP_NOSPEC) type = LXC_AUTO_CGROUP_MIXED; else if (type == LXC_AUTO_CGROUP_FULL_NOSPEC) type = LXC_AUTO_CGROUP_FULL_MIXED; /* Mount tmpfs */ - if (safe_mount("cgroup_root", tmpfspath, "tmpfs", - MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_RELATIME, - "size=10240k,mode=755", - root) < 0) - goto bad; + tmpfspath = must_make_path(root, "/sys/fs/cgroup", NULL); + ret = safe_mount("cgroup_root", tmpfspath, "tmpfs", + MS_NOSUID | MS_NODEV | MS_NOEXEC | MS_RELATIME, + "size=10240k,mode=755", root); + if (ret < 0) + goto on_error; for (i = 0; hierarchies[i]; i++) { char *controllerpath, *path2; struct hierarchy *h = hierarchies[i]; char *controller = strrchr(h->mountpoint, '/'); - int r; if (!controller) continue; @@ -2090,49 +2095,56 @@ static bool cgfsng_mount(void *hdata, const char *root, int type) free(controllerpath); continue; } - if (mkdir(controllerpath, 0755) < 0) { + ret = mkdir(controllerpath, 0755); + if (ret < 0) { SYSERROR("Error creating cgroup path: %s", controllerpath); free(controllerpath); - goto bad; + goto on_error; } - if (has_cgns && !has_sys_admin) { + if (has_cgns && wants_force_mount) { /* If cgroup namespaces are supported but the container * will not have CAP_SYS_ADMIN after it has started we * need to mount the cgroups manually. 
*/ - r = cg_mount_in_cgroup_namespace(type, h, controllerpath); + ret = cg_mount_in_cgroup_namespace(type, h, controllerpath); free(controllerpath); - if (r < 0) - goto bad; + if (ret < 0) + goto on_error; + continue; } - if (mount_cgroup_full(type, h, controllerpath, d->container_cgroup) < 0) { + ret = mount_cgroup_full(type, h, controllerpath, d->container_cgroup); + if (ret < 0) { free(controllerpath); - goto bad; + goto on_error; } + if (!cg_mount_needs_subdirs(type)) { free(controllerpath); continue; } - path2 = must_make_path(controllerpath, h->base_cgroup, d->container_cgroup, NULL); - if (mkdir_p(path2, 0755) < 0) { + + path2 = must_make_path(controllerpath, h->base_cgroup, + d->container_cgroup, NULL); + ret = mkdir_p(path2, 0755); + if (ret < 0) { free(controllerpath); free(path2); - goto bad; + goto on_error; } - r = do_secondstage_mounts_if_needed(type, h, controllerpath, path2, - d->container_cgroup); + ret = do_secondstage_mounts_if_needed( + type, h, controllerpath, path2, d->container_cgroup); free(controllerpath); free(path2); - if (r < 0) - goto bad; + if (ret < 0) + goto on_error; } retval = true; -bad: +on_error: free(tmpfspath); return retval; } diff --git a/src/lxc/conf.c b/src/lxc/conf.c index 01f11422a..98d8d3871 100644 --- a/src/lxc/conf.c +++ b/src/lxc/conf.c @@ -715,7 +715,7 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha if (flags & LXC_AUTO_CGROUP_MASK) { int cg_flags; - cg_flags = flags & LXC_AUTO_CGROUP_MASK; + cg_flags = flags & (LXC_AUTO_CGROUP_MASK & ~LXC_AUTO_CGROUP_FORCE); /* If the type of cgroup mount was not specified, it depends on the * container's capabilities as to what makes sense: if we have * CAP_SYS_ADMIN, the read-only part can be remounted read-write @@ -737,7 +737,8 @@ static int lxc_mount_auto_mounts(struct lxc_conf *conf, int flags, struct lxc_ha else cg_flags = has_sys_admin ? 
LXC_AUTO_CGROUP_FULL_RW : LXC_AUTO_CGROUP_FULL_MIXED; } - + if (flags & LXC_AUTO_CGROUP_FORCE) + cg_flags |= LXC_AUTO_CGROUP_FORCE; if (!cgroup_mount(conf->rootfs.path ? conf->rootfs.mount : "", handler, cg_flags)) { SYSERROR("error mounting /sys/fs/cgroup"); return -1; @@ -3343,7 +3344,7 @@ int lxc_setup(struct lxc_handler *handler) * before, /sys could not have been mounted * (is either mounted automatically or via fstab entries) */ - if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & LXC_AUTO_CGROUP_MASK, handler) < 0) { + if (lxc_mount_auto_mounts(lxc_conf, lxc_conf->auto_mounts & (LXC_AUTO_CGROUP_MASK), handler) < 0) { ERROR("failed to setup the automatic mounts for '%s'", name); return -1; } diff --git a/src/lxc/conf.h b/src/lxc/conf.h index c5f27336a..388c0518c 100644 --- a/src/lxc/conf.h +++ b/src/lxc/conf.h @@ -233,9 +233,9 @@ enum { * variants, which is safe. */ LXC_AUTO_CGROUP_NOSPEC = 0x0B0, /* /sys/fs/cgroup (partial mount, r/w or mixed, depending on caps) */ LXC_AUTO_CGROUP_FULL_NOSPEC = 0x0E0, /* /sys/fs/cgroup (full mount, r/w or mixed, depending on caps) */ - LXC_AUTO_CGROUP_MASK = 0x0F0, - - LXC_AUTO_ALL_MASK = 0x0FF, /* all known settings */ + LXC_AUTO_CGROUP_FORCE = 0x100, /* mount cgroups even when cgroup namespaces are supported */ + LXC_AUTO_CGROUP_MASK = 0x1F0, /* all known cgroup options, including LXC_AUTO_CGROUP_FORCE */ + LXC_AUTO_ALL_MASK = 0x1FF, /* all known settings */ }; /* diff --git a/src/lxc/confile.c b/src/lxc/confile.c index 66b7615fe..da90b1982 100644 --- a/src/lxc/confile.c +++ b/src/lxc/confile.c @@ -1706,26 +1706,30 @@ static int set_config_mount_auto(const char *key, const char *value, int mask; int flag; } allowed_auto_mounts[] = { - { "proc", LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED }, - { "proc:mixed", LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED }, - { "proc:rw", LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW }, - { "sys", LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED }, - { "sys:ro", LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO }, - { 
"sys:mixed", LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED }, - { "sys:rw", LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW }, - { "cgroup", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_NOSPEC }, - { "cgroup:mixed", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_MIXED }, - { "cgroup:ro", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RO }, - { "cgroup:rw", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RW }, - { "cgroup-full", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_NOSPEC }, - { "cgroup-full:mixed", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_MIXED }, - { "cgroup-full:ro", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_RO }, - { "cgroup-full:rw", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_RW }, + { "proc", LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED }, + { "proc:mixed", LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_MIXED }, + { "proc:rw", LXC_AUTO_PROC_MASK, LXC_AUTO_PROC_RW }, + { "sys", LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED }, + { "sys:ro", LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RO }, + { "sys:mixed", LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_MIXED }, + { "sys:rw", LXC_AUTO_SYS_MASK, LXC_AUTO_SYS_RW }, + { "cgroup", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_NOSPEC }, + { "cgroup:mixed", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_MIXED }, + { "cgroup:ro", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RO }, + { "cgroup:rw", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RW }, + { "cgroup:force", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_NOSPEC | LXC_AUTO_CGROUP_FORCE }, + { "cgroup:mixed:force", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_MIXED | LXC_AUTO_CGROUP_FORCE }, + { "cgroup:ro:force", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RO | LXC_AUTO_CGROUP_FORCE }, + { "cgroup:rw:force", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_RW | LXC_AUTO_CGROUP_FORCE }, + { "cgroup-full", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_NOSPEC }, + { "cgroup-full:mixed", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_MIXED }, + { "cgroup-full:ro", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_RO }, + { "cgroup-full:rw", LXC_AUTO_CGROUP_MASK, LXC_AUTO_CGROUP_FULL_RW }, /* For adding anything that is just a single on/off, 
but has no - * options: keep mask and flag identical and just define the enum - * value as an unused bit so far + * options: keep mask and flag identical and just define the enum + * value as an unused bit so far */ - { NULL, 0, 0 } + { NULL, 0, 0 } }; if (lxc_config_value_empty(value)) {