From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 From: Wolfgang Bumiller Date: Wed, 28 Mar 2018 13:37:28 +0200 Subject: [PATCH] PVE: [Up] separate the limiting from the namespaced cgroup root When cgroup namespaces are enabled a privileged container with mixed cgroups has full write access to its own root cgroup effectively allowing it to overwrite values written from the outside or configured via lxc.cgroup.*. This patch causes an additional 'ns/' directory to be created in all cgroups if cgroup namespaces and cgfsng are being used in order to combat this. Signed-off-by: Wolfgang Bumiller --- src/lxc/cgroups/cgfsng.c | 94 +++++++++++++++++++++++++++++++++------- src/lxc/cgroups/cgroup.h | 18 ++++++-- src/lxc/commands.c | 87 ++++++++++++++++++++++++++++--------- src/lxc/commands.h | 2 + src/lxc/criu.c | 4 +- src/lxc/start.c | 28 +++++++++--- 6 files changed, 183 insertions(+), 50 deletions(-) diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c index ab99b47c5..ac8f469bb 100644 --- a/src/lxc/cgroups/cgfsng.c +++ b/src/lxc/cgroups/cgfsng.c @@ -818,6 +818,7 @@ static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char new->mountpoint = mountpoint; new->container_base_path = container_base_path; new->container_full_path = NULL; + new->container_inner_path = NULL; new->monitor_full_path = NULL; new->version = type; @@ -1059,6 +1060,9 @@ static int cgroup_rmdir(struct hierarchy **hierarchies, free(h->container_full_path); h->container_full_path = NULL; + + free(h->container_inner_path); + h->container_inner_path = NULL; } return 0; @@ -1070,6 +1074,7 @@ struct generic_userns_exec_data { struct lxc_conf *conf; uid_t origuid; /* target uid in parent namespace */ char *path; + bool inner; }; static int cgroup_rmdir_wrapper(void *data) @@ -1112,6 +1117,7 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops, wrap.container_cgroup = ops->container_cgroup; wrap.hierarchies = ops->hierarchies; wrap.conf = handler->conf; + wrap.inner = false; if (handler->conf && !lxc_list_empty(&handler->conf->id_map)) ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap, @@ -1323,17 +1329,26 @@ static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname) return cg_unified_create_cgroup(h, cgname); } -static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname) +static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname, bool inner) { int ret; + char *path; - if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) { + if (!inner && !cg_legacy_handle_cpuset_hierarchy(h, cgname)) { ERROR("Failed to handle legacy cpuset controller"); return false; } - h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL); - ret = mkdir_eexist_on_last(h->container_full_path, 0755); + if (inner) { + path = must_make_path(h->container_full_path, CGROUP_NAMESPACE_SUBDIR, NULL); + h->container_inner_path = path; + ret = mkdir(path, 0755); + } else { + path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL); + h->container_full_path = path; + ret = mkdir_eexist_on_last(path, 0755); + } + if (ret < 0) { ERROR("Failed to create cgroup \"%s\"", h->container_full_path); return false; @@ -1425,11 +1440,29 @@ on_error: return bret; } +static inline bool cgfsng_create_inner(struct cgroup_ops *ops) +{ + size_t i; + bool ret = true; + char *cgname = must_make_path(ops->container_cgroup, CGROUP_NAMESPACE_SUBDIR, NULL); + for (i = 0; ops->hierarchies[i]; i++) { + if (!container_create_path_for_hierarchy(ops->hierarchies[i], cgname, true)) { + SYSERROR("Failed to create %s namespace subdirectory: %s", + ops->hierarchies[i]->container_full_path, strerror(errno)); + ret = false; + break; + } + } + free(cgname); + return ret; +} + /* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern; * next cgroup_pattern-1, -2, ..., -999. */ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops, - struct lxc_handler *handler) + struct lxc_handler *handler, + bool inner) { int i; size_t len; @@ -1438,10 +1471,17 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops, struct lxc_conf *conf = handler->conf; if (ops->container_cgroup) { + if (inner) + return cgfsng_create_inner(ops); WARN("cgfsng_create called a second time: %s", ops->container_cgroup); return false; } + if (inner) { + ERROR("cgfsng_create called twice for inner cgroup"); + return false; + } + if (!conf) return false; @@ -1482,7 +1522,7 @@ again: } for (i = 0; ops->hierarchies[i]; i++) { - if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) { + if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup, false)) { ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path); free(ops->hierarchies[i]->container_full_path); ops->hierarchies[i]->container_full_path = NULL; @@ -1505,7 +1545,8 @@ out_free: } __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid, - bool monitor) + bool monitor, + bool inner) { int len; char pidstr[INTTYPE_TO_STRLEN(pid_t)]; @@ -1521,6 +1562,9 @@ __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid, if (monitor) path = must_make_path(ops->hierarchies[i]->monitor_full_path, "cgroup.procs", NULL); + else if (inner) + path = must_make_path(ops->hierarchies[i]->container_inner_path, + "cgroup.procs", NULL); else path = must_make_path(ops->hierarchies[i]->container_full_path, "cgroup.procs", NULL); @@ -1538,12 +1582,12 @@ __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid, __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid) { - return __do_cgroup_enter(ops, pid, true); + return __do_cgroup_enter(ops, pid, true, false); } -static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid) +static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid, bool inner) { - return __do_cgroup_enter(ops, pid, false); + return __do_cgroup_enter(ops, pid, false, inner); } static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid, @@ -1609,9 +1653,15 @@ static int chown_cgroup_wrapper(void *data) char *fullpath; char *path = arg->hierarchies[i]->container_full_path; + if (arg->inner) + path = must_make_path(path, CGROUP_NAMESPACE_SUBDIR, NULL); + ret = chowmod(path, destuid, nsgid, 0775); - if (ret < 0) + if (ret < 0) { + if (arg->inner) + free(path); return -1; + } /* Failures to chown() these are inconvenient but not * detrimental We leave these owned by the container launcher, @@ -1630,8 +1680,11 @@ static int chown_cgroup_wrapper(void *data) (void)chowmod(fullpath, destuid, nsgid, 0664); free(fullpath); - if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC) + if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC) { + if (arg->inner) + free(path); continue; + } fullpath = must_make_path(path, "cgroup.subtree_control", NULL); (void)chowmod(fullpath, destuid, nsgid, 0664); @@ -1640,13 +1693,17 @@ static int chown_cgroup_wrapper(void *data) fullpath = must_make_path(path, "cgroup.threads", NULL); (void)chowmod(fullpath, destuid, nsgid, 0664); free(fullpath); + + if (arg->inner) + free(path); } return 0; } __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops, - struct lxc_conf *conf) + struct lxc_conf *conf, + bool inner) { struct generic_userns_exec_data wrap; @@ -1657,6 +1714,7 @@ __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops, wrap.path = NULL; wrap.hierarchies = ops->hierarchies; wrap.conf = conf; + wrap.inner = inner; if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap, "chown_cgroup_wrapper") < 0) { @@ -2038,7 +2096,8 @@ __cgfsng_ops static bool cgfsng_unfreeze(struct cgroup_ops *ops) } __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops, - const char *controller) + const char *controller, + bool inner) { struct hierarchy *h; @@ -2049,6 +2108,9 @@ __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops, return NULL; } + if (inner) + return h->container_inner_path ? h->container_inner_path + strlen(h->mountpoint) : NULL; + return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL; } @@ -2080,7 +2142,7 @@ static int __cg_unified_attach(const struct hierarchy *h, const char *name, int fret = -1, idx = 0; char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL; - container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller); + container_cgroup = lxc_cmd_get_attach_cgroup_path(name, lxcpath, controller); /* not running */ if (!container_cgroup) return 0; @@ -2161,7 +2223,7 @@ __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name, continue; } - path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]); + path = lxc_cmd_get_attach_cgroup_path(name, lxcpath, h->controllers[0]); /* not running */ if (!path) continue; diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h index d4dcd506b..59445b5a5 100644 --- a/src/lxc/cgroups/cgroup.h +++ b/src/lxc/cgroups/cgroup.h @@ -32,6 +32,12 @@ #define MONITOR_CGROUP "lxc.monitor" #define PIVOT_CGROUP "lxc.pivot" +/* When lxc.cgroup.protect_limits is in effect the container's cgroup namespace + * will be moved into an additional subdirectory "cgns/" inside the cgroup in + * order to prevent it from accessing the outer limiting cgroup. + */ +#define CGROUP_NAMESPACE_SUBDIR "cgns" + struct lxc_handler; struct lxc_conf; struct lxc_list; @@ -72,6 +78,9 @@ typedef enum { * @monitor_full_path * - The full path to the monitor's cgroup. * + * @container_inner_path + * - The full path to the container's inner cgroup when protect_limits is used. + * * @version * - legacy hierarchy * If the hierarchy is a legacy hierarchy this will be set to @@ -85,6 +94,7 @@ struct hierarchy { char *mountpoint; char *container_base_path; char *container_full_path; + char *container_inner_path; char *monitor_full_path; int version; }; @@ -139,9 +149,9 @@ struct cgroup_ops { void (*monitor_destroy)(struct cgroup_ops *ops, struct lxc_handler *handler); bool (*monitor_create)(struct cgroup_ops *ops, struct lxc_handler *handler); bool (*monitor_enter)(struct cgroup_ops *ops, pid_t pid); - bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler); - bool (*payload_enter)(struct cgroup_ops *ops, pid_t pid); - const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller); + bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler, bool inner); + bool (*payload_enter)(struct cgroup_ops *ops, pid_t pid, bool inner); + const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller, bool inner); bool (*escape)(const struct cgroup_ops *ops, struct lxc_conf *conf); int (*num_hierarchies)(struct cgroup_ops *ops); bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out); @@ -152,7 +162,7 @@ struct cgroup_ops { bool (*unfreeze)(struct cgroup_ops *ops); bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_conf *conf, bool with_devices); - bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf); + bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf, bool inner); bool (*attach)(struct cgroup_ops *ops, const char *name, const char *lxcpath, pid_t pid); bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler, diff --git a/src/lxc/commands.c b/src/lxc/commands.c index 133384d72..b41a76000 100644 --- a/src/lxc/commands.c +++ b/src/lxc/commands.c @@ -427,20 +427,8 @@ static int lxc_cmd_get_clone_flags_callback(int fd, struct lxc_cmd_req *req, return lxc_cmd_rsp_send(fd, &rsp); } -/* - * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a - * particular subsystem. This is the cgroup path relative to the root - * of the cgroup filesystem. - * - * @name : name of container to connect to - * @lxcpath : the lxcpath in which the container is running - * @subsystem : the subsystem being asked about - * - * Returns the path on success, NULL on failure. The caller must free() the - * returned path. - */ -char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath, - const char *subsystem) +char *do_lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath, + const char *subsystem, bool inner) { int ret, stopped; struct lxc_cmd_rr cmd = { @@ -453,8 +441,18 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath, cmd.req.data = subsystem; cmd.req.datalen = 0; - if (subsystem) - cmd.req.datalen = strlen(subsystem) + 1; + if (subsystem) { + size_t subsyslen = strlen(subsystem); + if (inner) { + char *data = alloca(subsyslen+2); + memcpy(data, subsystem, subsyslen+1); + data[subsyslen+1] = 1; + cmd.req.datalen = subsyslen+2, + cmd.req.data = data; + } else { + cmd.req.datalen = subsyslen+1; + } + } ret = lxc_cmd(name, &cmd, &stopped, lxcpath, NULL); if (ret < 0) @@ -469,6 +467,42 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath, return cmd.rsp.data; } +/* + * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a + * particular subsystem. This is the cgroup path relative to the root + * of the cgroup filesystem. + * + * @name : name of container to connect to + * @lxcpath : the lxcpath in which the container is running + * @subsystem : the subsystem being asked about + * + * Returns the path on success, NULL on failure. The caller must free() the + * returned path. + */ +char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath, + const char *subsystem) +{ + return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, false); +} + +/* + * lxc_cmd_get_attach_cgroup_path: Calculate a container's inner cgroup path + * for a particular subsystem. This is the cgroup path relative to the root + * of the cgroup filesystem. + * + * @name : name of container to connect to + * @lxcpath : the lxcpath in which the container is running + * @subsystem : the subsystem being asked about + * + * Returns the path on success, NULL on failure. The caller must free() the + * returned path. + */ +char *lxc_cmd_get_attach_cgroup_path(const char *name, const char *lxcpath, + const char *subsystem) +{ + return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, true); +} + static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req, struct lxc_handler *handler) { @@ -476,10 +510,21 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req, struct lxc_cmd_rsp rsp; struct cgroup_ops *cgroup_ops = handler->cgroup_ops; - if (req->datalen > 0) - path = cgroup_ops->get_cgroup(cgroup_ops, req->data); - else - path = cgroup_ops->get_cgroup(cgroup_ops, NULL); + if (req->datalen > 0) { + const char *subsystem; + size_t subsyslen; + bool inner = false; + subsystem = req->data; + subsyslen = strlen(subsystem); + if (req->datalen == subsyslen+2) + inner = (subsystem[subsyslen+1] == 1); + + path = cgroup_ops->get_cgroup(cgroup_ops, req->data, inner); + } else { + // FIXME: cgroup separation for cgroup v2 cannot be handled + // like we used to do v1 here... need to figure this out... + path = cgroup_ops->get_cgroup(cgroup_ops, NULL, false); + } if (!path) return -1; @@ -651,7 +696,7 @@ static int lxc_cmd_stop_callback(int fd, struct lxc_cmd_req *req, * lxc_unfreeze() would do another cmd (GET_CGROUP) which would * deadlock us. */ - if (!cgroup_ops->get_cgroup(cgroup_ops, "freezer")) + if (!cgroup_ops->get_cgroup(cgroup_ops, "freezer", false)) return 0; if (cgroup_ops->unfreeze(cgroup_ops)) diff --git a/src/lxc/commands.h b/src/lxc/commands.h index 2c024b65d..7c4c00b1e 100644 --- a/src/lxc/commands.h +++ b/src/lxc/commands.h @@ -88,6 +88,8 @@ extern int lxc_cmd_console(const char *name, int *ttynum, int *fd, */ extern char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath, const char *subsystem); +extern char *lxc_cmd_get_attach_cgroup_path(const char *name, + const char *lxcpath, const char *subsystem); extern int lxc_cmd_get_clone_flags(const char *name, const char *lxcpath); extern char *lxc_cmd_get_config_item(const char *name, const char *item, const char *lxcpath); extern char *lxc_cmd_get_name(const char *hashed_sock); diff --git a/src/lxc/criu.c b/src/lxc/criu.c index 3d857b541..ec9bcb7e4 100644 --- a/src/lxc/criu.c +++ b/src/lxc/criu.c @@ -332,7 +332,7 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf, } else { const char *p; - p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]); + p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0], false); if (!p) { ERROR("failed to get cgroup path for %s", controllers[0]); goto err; @@ -976,7 +976,7 @@ static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_ goto out_fini_handler; handler->cgroup_ops = cgroup_ops; - if (!cgroup_ops->payload_create(cgroup_ops, handler)) { + if (!cgroup_ops->payload_create(cgroup_ops, handler, false)) { ERROR("failed creating groups"); goto out_fini_handler; } diff --git a/src/lxc/start.c b/src/lxc/start.c index dae3bcfe5..f3b29d6cd 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -1649,7 +1649,7 @@ static int lxc_spawn(struct lxc_handler *handler) } } - if (!cgroup_ops->payload_create(cgroup_ops, handler)) { + if (!cgroup_ops->payload_create(cgroup_ops, handler, false)) { ERROR("Failed creating cgroups"); goto out_delete_net; } @@ -1743,10 +1743,10 @@ static int lxc_spawn(struct lxc_handler *handler) goto out_delete_net; } - if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid)) + if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid, false)) goto out_delete_net; - if (!cgroup_ops->chown(cgroup_ops, handler->conf)) + if (!cgroup_ops->chown(cgroup_ops, handler->conf, false)) goto out_delete_net; /* Now we're ready to preserve the network namespace */ @@ -1813,16 +1813,30 @@ static int lxc_spawn(struct lxc_handler *handler) } } - ret = lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE); - if (ret < 0) - goto out_delete_net; - if (!cgroup_ops->setup_limits(cgroup_ops, handler->conf, true)) { ERROR("Failed to setup legacy device cgroup controller limits"); goto out_delete_net; } TRACE("Set up legacy device cgroup controller limits"); + if (cgns_supported()) { + if (!cgroup_ops->payload_create(cgroup_ops, handler, true)) { + ERROR("failed to create inner cgroup separation layer"); + goto out_delete_net; + } + if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid, true)) { + ERROR("failed to enter inner cgroup separation layer"); + goto out_delete_net; + } + if (!cgroup_ops->chown(cgroup_ops, handler->conf, true)) { + ERROR("failed chown inner cgroup separation layer"); + goto out_delete_net; + } + } + + if (lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE)) + goto out_delete_net; + if (handler->ns_clone_flags & CLONE_NEWCGROUP) { /* Now we're ready to preserve the cgroup namespace */ ret = lxc_try_preserve_ns(handler->pid, "cgroup"); -- 2.20.1