diff --git a/doc/lxc.container.conf.sgml.in b/doc/lxc.container.conf.sgml.in
index 1e702c717..18ad924a3 100644
--- a/doc/lxc.container.conf.sgml.in
+++ b/doc/lxc.container.conf.sgml.in
@@ -1593,6 +1593,22 @@ mknod errno 0
+
+
+
+
+
+
+
+ If unset, then this version of lxc is not aware of cgroup
+ namespaces. If set, it will be set to 1, and lxc is aware
+ of cgroup namespaces. Note this does not guarantee that
+ cgroup namespaces are enabled in the kernel. This is used
+ by the lxcfs mount hook.
+
+
+
+
Logging
diff --git a/src/lxc/attach.c b/src/lxc/attach.c
index 13989e863..2dc9a0725 100644
--- a/src/lxc/attach.c
+++ b/src/lxc/attach.c
@@ -957,6 +957,13 @@ int lxc_attach(const char* name, const char* lxcpath, lxc_attach_exec_t exec_fun
WARN("could not change directory to '%s'", new_cwd);
free(cwd);
+ if (options->attach_flags & LXC_ATTACH_MOVE_TO_CGROUP && cgns_supported()) {
+ if (unshare(CLONE_NEWCGROUP) != 0) {
+ SYSERROR("cgroupns unshare: permission denied");
+ rexit(-1);
+ }
+ }
+
/* now create the real child process */
{
struct attach_clone_payload payload = {
diff --git a/src/lxc/cgmanager.c b/src/lxc/cgmanager.c
index d69eb3dd8..559628574 100644
--- a/src/lxc/cgmanager.c
+++ b/src/lxc/cgmanager.c
@@ -210,11 +210,6 @@ static void check_supports_multiple_controllers(pid_t pid)
cgm_supports_multiple_controllers = false;
cgm_all_controllers_same = false;
- if (api_version < CGM_SUPPORTS_MULT_CONTROLLERS) {
- cgm_supports_multiple_controllers = false;
- return;
- }
-
cgm_supports_multiple_controllers = true;
if (pid == -1)
@@ -544,17 +539,13 @@ static void *cgm_init(const char *name)
{
struct cgm_data *d;
+ d = malloc(sizeof(*d));
+ if (!d)
+ return NULL;
+
if (!cgm_dbus_connect()) {
ERROR("Error connecting to cgroup manager");
- return NULL;
- }
-
- check_supports_multiple_controllers(-1);
-
- d = malloc(sizeof(*d));
- if (!d) {
- cgm_dbus_disconnect();
- return NULL;
+ goto err1;
}
memset(d, 0, sizeof(*d));
@@ -1132,6 +1123,9 @@ static void cull_user_controllers(void)
}
}
+/*
+ * return true if inword is in the comma-delimited list cgroup_use
+ */
static bool in_comma_list(const char *inword, const char *cgroup_use)
{
char *e;
@@ -1148,6 +1142,23 @@ static bool in_comma_list(const char *inword, const char *cgroup_use)
return false;
}
+/*
+ * Both inlist and checklist are comma-delimited lists of cgroups.
+ * Returns true as soon as one entry of inlist is found in checklist
+ * (as decided by in_comma_list()), and false if none of them is.
+ */
+static bool any_in_comma_list(const char *inlist, const char *checklist)
+{
+	char *state = NULL;
+	char *entry;
+	char *scratch;
+
+	/* strtok_r() modifies the string it splits, so work on a stack copy */
+	scratch = alloca(strlen(inlist) + 1);
+	strcpy(scratch, inlist);
+
+	entry = strtok_r(scratch, ",", &state);
+	while (entry) {
+		if (in_comma_list(entry, checklist))
+			return true;
+		entry = strtok_r(NULL, ",", &state);
+	}
+
+	return false;
+}
+
static bool in_subsystem_list(const char *c)
{
int i;
@@ -1202,6 +1213,132 @@ static bool verify_and_prune(const char *cgroup_use)
return true;
}
+/*
+ * Remove entry 'which' from the subsystems array: free it, move every
+ * later entry (including the terminating NULL) down by one slot, and
+ * decrement nr_subsystems.  An out-of-range index is treated as a
+ * programming error and terminates the process.
+ */
+static void drop_subsystem(int which)
+{
+	int slot;
+
+	if (!(which >= 0 && which < nr_subsystems)) {
+		ERROR("code error: dropping invalid subsystem index\n");
+		exit(1);
+	}
+
+	free(subsystems[which]);
+
+	/*
+	 * The array holds nr_subsystems+1 entries, the last one a NULL, so
+	 * reading slot+1 is valid for every slot below nr_subsystems.
+	 */
+	slot = which;
+	while (slot < nr_subsystems) {
+		subsystems[slot] = subsystems[slot + 1];
+		slot++;
+	}
+
+	nr_subsystems--;
+}
+
+/*
+ * Check whether we can create the cgroups we would want, by asking
+ * cgmanager to create cgroup 'probe' under 'controller'.
+ *
+ * Returns true if the create call succeeds.  On failure the nih error
+ * is logged and released, and false is returned.
+ */
+static bool subsys_is_writeable(const char *controller, const char *probe)
+{
+	int32_t existed;
+	NihError *nerr;
+
+	if (cgmanager_create_sync(NULL, cgroup_manager, controller,
+				  probe, &existed) == 0)
+		return true;
+
+	nerr = nih_error_get();
+	ERROR("call to cgmanager_create_sync failed: %s", nerr->message);
+	nih_free(nerr);
+	ERROR("Failed to create %s:%s", controller, probe);
+
+	return false;
+}
+
+/*
+ * Return true if s names a subsystem which we cannot do without:
+ * "systemd", "name=systemd" or "freezer".  The comparison is an exact,
+ * case-sensitive string match.
+ */
+static bool is_crucial_subsys(const char *s)
+{
+	static const char *const crucial[] = {
+		"systemd",
+		"name=systemd",
+		"freezer",
+	};
+	size_t idx;
+
+	for (idx = 0; idx < sizeof(crucial) / sizeof(crucial[0]); idx++) {
+		if (!strcmp(s, crucial[idx]))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Return a pointer to the last entry of the comma-delimited string
+ * 'list', i.e. the text following the final comma.  If there is no
+ * comma, 'list' itself is returned.  The result points into 'list';
+ * nothing is allocated or modified.
+ */
+static char *get_last_controller_in_list(char *list)
+{
+	char *last_comma = strrchr(list, ',');
+
+	if (!last_comma)
+		return list;
+
+	return last_comma + 1;
+}
+
+/*
+ * Make sure that all the controllers are writeable.
+ * If any are not, then
+ *   - if they are crucial subsystems, refuse to start
+ *   - else if they are listed in lxc.cgroup.use, refuse to start
+ *   - else warn and do not use them
+ *
+ * Writeability is probed by creating a cgroup whose name is
+ * lxc.cgroup.pattern with "%n" replaced by "lxcprobe-<pid>"; on success
+ * the probe cgroup is removed again.
+ *
+ * cgroup_use is the lxc.cgroup.use value and may be NULL.
+ *
+ * Returns true if every subsystem left in the subsystems array passed
+ * the probe, false on any error.  The cgmanager connection opened here
+ * is closed again on every path once it has been established.
+ */
+static bool verify_final_subsystems(const char *cgroup_use)
+{
+	int i, len;
+	bool dropped_any = false;
+	bool ret = false;
+	const char *cgroup_pattern;
+	/* NULL so that the common exit path can free() it unconditionally */
+	char tmpnam[50], *probe = NULL;
+
+	if (!cgm_dbus_connect()) {
+		ERROR("Error connecting to cgroup manager");
+		return false;
+	}
+
+	cgroup_pattern = lxc_global_config_value("lxc.cgroup.pattern");
+	len = snprintf(tmpnam, sizeof(tmpnam), "lxcprobe-%d", getpid());
+	if (len < 0 || (size_t)len >= sizeof(tmpnam)) {
+		ERROR("Attack - format string modified?");
+		/* we are connected by now, so leave through the cleanup path */
+		goto out;
+	}
+	probe = lxc_string_replace("%n", tmpnam, cgroup_pattern);
+	if (!probe)
+		goto out;
+
+	/*
+	 * The index must start at 0.  It is kept separate from the
+	 * snprintf() length above, which would otherwise make the loop
+	 * skip the first subsystems.
+	 */
+	i = 0;
+	while (i < nr_subsystems) {
+		char *p = get_last_controller_in_list(subsystems[i]);
+
+		if (!subsys_is_writeable(p, probe)) {
+			if (is_crucial_subsys(p)) {
+				ERROR("Cannot write to crucial subsystem %s\n",
+				      subsystems[i]);
+				goto out;
+			}
+			if (cgroup_use && any_in_comma_list(subsystems[i], cgroup_use)) {
+				ERROR("Cannot write to subsystem %s which is requested in lxc.cgroup.use\n",
+				      subsystems[i]);
+				goto out;
+			}
+			WARN("Cannot write to subsystem %s, continuing with out it\n",
+			     subsystems[i]);
+			dropped_any = true;
+			/*
+			 * drop_subsystem() shifts the following entries down,
+			 * so slot i already holds the next candidate.
+			 */
+			drop_subsystem(i);
+		} else {
+			cgm_remove_cgroup(subsystems[i], probe);
+			i++;
+		}
+	}
+
+	if (dropped_any)
+		cgm_all_controllers_same = false;
+	ret = true;
+
+out:
+	free(probe);
+	cgm_dbus_disconnect();
+	return ret;
+}
+
static bool collect_subsytems(void)
{
char *line = NULL;
@@ -1285,7 +1422,7 @@ collected:
/* make sure that cgroup.use can be and is honored */
const char *cgroup_use = lxc_global_config_value("lxc.cgroup.use");
if (!cgroup_use && errno != 0)
- goto out_good;
+ goto final_verify;
if (cgroup_use) {
if (!verify_and_prune(cgroup_use)) {
free_subsystems();
@@ -1295,8 +1432,8 @@ collected:
cgm_all_controllers_same = false;
}
-out_good:
- return true;
+final_verify:
+ return verify_final_subsystems(cgroup_use);
out_free:
free(line);
@@ -1313,23 +1450,20 @@ out_free:
*/
struct cgroup_ops *cgm_ops_init(void)
{
+ check_supports_multiple_controllers(-1);
if (!collect_subsytems())
return NULL;
- if (!cgm_dbus_connect())
- goto err1;
- // root; try to escape to root cgroup
- if (geteuid() == 0 && !cgm_escape())
- goto err2;
- cgm_dbus_disconnect();
+ if (api_version < CGM_SUPPORTS_MULT_CONTROLLERS)
+ cgm_supports_multiple_controllers = false;
+
+ // if root, try to escape to root cgroup
+ if (geteuid() == 0 && !cgm_escape()) {
+ free_subsystems();
+ return NULL;
+ }
return &cgmanager_ops;
-
-err2:
- cgm_dbus_disconnect();
-err1:
- free_subsystems();
- return NULL;
}
/* unfreeze is called by the command api after killing a container. */
diff --git a/src/lxc/namespace.h b/src/lxc/namespace.h
index 28f17e687..027c76588 100644
--- a/src/lxc/namespace.h
+++ b/src/lxc/namespace.h
@@ -34,6 +34,9 @@
#ifndef CLONE_NEWNS
# define CLONE_NEWNS 0x00020000
#endif
+#ifndef CLONE_NEWCGROUP
+# define CLONE_NEWCGROUP 0x02000000
+#endif
#ifndef CLONE_NEWUTS
# define CLONE_NEWUTS 0x04000000
#endif
diff --git a/src/lxc/start.c b/src/lxc/start.c
index 79dbe335d..0d91eb394 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -451,6 +451,9 @@ struct lxc_handler *lxc_init(const char *name, struct lxc_conf *conf, const char
if (conf->console.log_path && setenv("LXC_CONSOLE_LOGPATH", conf->console.log_path, 1)) {
SYSERROR("failed to set environment variable for console log");
}
+ if (setenv("LXC_CGNS_AWARE", "1", 1)) {
+ SYSERROR("failed to set LXC_CGNS_AWARE environment variable");
+ }
/* End of environment variable setup for hooks */
if (run_lxc_hooks(name, "pre-start", conf, handler->lxcpath, NULL)) {
@@ -842,6 +845,11 @@ static int do_start(void *data)
if (handler->backgrounded && null_stdfds() < 0)
goto out_warn_father;
+ if (cgns_supported() && unshare(CLONE_NEWCGROUP) != 0) {
+ SYSERROR("Failed to unshare cgroup namespace");
+ goto out_warn_father;
+ }
+
/* after this call, we are in error because this
* ops should not return as it execs */
handler->ops->start(handler, handler->data);
diff --git a/src/lxc/utils.c b/src/lxc/utils.c
index ad9b0a294..ed8c4c42d 100644
--- a/src/lxc/utils.c
+++ b/src/lxc/utils.c
@@ -1185,6 +1185,11 @@ bool file_exists(const char *f)
return stat(f, &statbuf) == 0;
}
+/*
+ * Return true if /proc/self/ns/cgroup exists, as reported by
+ * file_exists().  Callers use this to decide whether to attempt
+ * unshare(CLONE_NEWCGROUP).
+ */
+bool cgns_supported(void)
+{
+	static const char cgroup_ns_path[] = "/proc/self/ns/cgroup";
+
+	return file_exists(cgroup_ns_path);
+}
+
/* historically lxc-init has been under /usr/lib/lxc and under
* /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
*/
diff --git a/src/lxc/utils.h b/src/lxc/utils.h
index 059026f01..96ec45c20 100644
--- a/src/lxc/utils.h
+++ b/src/lxc/utils.h
@@ -273,6 +273,7 @@ int detect_shared_rootfs(void);
int detect_ramfs_rootfs(void);
char *on_path(char *cmd, const char *rootfs);
bool file_exists(const char *f);
+bool cgns_supported(void);
char *choose_init(const char *rootfs);
int print_to_file(const char *file, const char *content);
bool switch_to_ns(pid_t pid, const char *ns);