lxc/debian/patches/pve/0004-PVE-Up-separate-the-limiting-from-the-namespaced-cgr.patch
Wolfgang Bumiller 39994e5496 fix issues with shell detection on attach
Merge: attach: always use getent

Commit message:
In debian buster, some libnss plugins (if installed) can
cause getpwent to segfault instead of erroring out cleanly.
To avoid this, stick to always using getent.

Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
2019-08-13 14:41:05 +02:00

566 lines
19 KiB
Diff

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Wolfgang Bumiller <w.bumiller@proxmox.com>
Date: Wed, 28 Mar 2018 13:37:28 +0200
Subject: [PATCH] PVE: [Up] separate the limiting from the namespaced cgroup
root
When cgroup namespaces are enabled a privileged container
with mixed cgroups has full write access to its own root
cgroup effectively allowing it to overwrite values written
from the outside or configured via lxc.cgroup.*.
This patch causes an additional 'ns/' directory to be
created in all cgroups if cgroup namespaces and cgfsng are
being used in order to combat this.
Signed-off-by: Wolfgang Bumiller <w.bumiller@proxmox.com>
---
src/lxc/cgroups/cgfsng.c | 94 +++++++++++++++++++++++++++++++++-------
src/lxc/cgroups/cgroup.h | 18 ++++++--
src/lxc/commands.c | 87 ++++++++++++++++++++++++++++---------
src/lxc/commands.h | 2 +
src/lxc/criu.c | 4 +-
src/lxc/start.c | 28 +++++++++---
6 files changed, 183 insertions(+), 50 deletions(-)
diff --git a/src/lxc/cgroups/cgfsng.c b/src/lxc/cgroups/cgfsng.c
index ab99b47c5..ac8f469bb 100644
--- a/src/lxc/cgroups/cgfsng.c
+++ b/src/lxc/cgroups/cgfsng.c
@@ -818,6 +818,7 @@ static struct hierarchy *add_hierarchy(struct hierarchy ***h, char **clist, char
new->mountpoint = mountpoint;
new->container_base_path = container_base_path;
new->container_full_path = NULL;
+ new->container_inner_path = NULL;
new->monitor_full_path = NULL;
new->version = type;
@@ -1059,6 +1060,9 @@ static int cgroup_rmdir(struct hierarchy **hierarchies,
free(h->container_full_path);
h->container_full_path = NULL;
+
+ free(h->container_inner_path);
+ h->container_inner_path = NULL;
}
return 0;
@@ -1070,6 +1074,7 @@ struct generic_userns_exec_data {
struct lxc_conf *conf;
uid_t origuid; /* target uid in parent namespace */
char *path;
+ bool inner;
};
static int cgroup_rmdir_wrapper(void *data)
@@ -1112,6 +1117,7 @@ __cgfsng_ops static void cgfsng_payload_destroy(struct cgroup_ops *ops,
wrap.container_cgroup = ops->container_cgroup;
wrap.hierarchies = ops->hierarchies;
wrap.conf = handler->conf;
+ wrap.inner = false;
if (handler->conf && !lxc_list_empty(&handler->conf->id_map))
ret = userns_exec_1(handler->conf, cgroup_rmdir_wrapper, &wrap,
@@ -1323,17 +1329,26 @@ static bool monitor_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
return cg_unified_create_cgroup(h, cgname);
}
-static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname)
+static bool container_create_path_for_hierarchy(struct hierarchy *h, char *cgname, bool inner)
{
int ret;
+ char *path;
- if (!cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
+ if (!inner && !cg_legacy_handle_cpuset_hierarchy(h, cgname)) {
ERROR("Failed to handle legacy cpuset controller");
return false;
}
- h->container_full_path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
- ret = mkdir_eexist_on_last(h->container_full_path, 0755);
+ if (inner) {
+ path = must_make_path(h->container_full_path, CGROUP_NAMESPACE_SUBDIR, NULL);
+ h->container_inner_path = path;
+ ret = mkdir(path, 0755);
+ } else {
+ path = must_make_path(h->mountpoint, h->container_base_path, cgname, NULL);
+ h->container_full_path = path;
+ ret = mkdir_eexist_on_last(path, 0755);
+ }
+
if (ret < 0) {
ERROR("Failed to create cgroup \"%s\"", h->container_full_path);
return false;
@@ -1425,11 +1440,29 @@ on_error:
return bret;
}
+static inline bool cgfsng_create_inner(struct cgroup_ops *ops)
+{
+ size_t i;
+ bool ret = true;
+ char *cgname = must_make_path(ops->container_cgroup, CGROUP_NAMESPACE_SUBDIR, NULL);
+ for (i = 0; ops->hierarchies[i]; i++) {
+ if (!container_create_path_for_hierarchy(ops->hierarchies[i], cgname, true)) {
+ SYSERROR("Failed to create %s namespace subdirectory: %s",
+ ops->hierarchies[i]->container_full_path, strerror(errno));
+ ret = false;
+ break;
+ }
+ }
+ free(cgname);
+ return ret;
+}
+
/* Try to create the same cgroup in all hierarchies. Start with cgroup_pattern;
* next cgroup_pattern-1, -2, ..., -999.
*/
__cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
- struct lxc_handler *handler)
+ struct lxc_handler *handler,
+ bool inner)
{
int i;
size_t len;
@@ -1438,10 +1471,17 @@ __cgfsng_ops static inline bool cgfsng_payload_create(struct cgroup_ops *ops,
struct lxc_conf *conf = handler->conf;
if (ops->container_cgroup) {
+ if (inner)
+ return cgfsng_create_inner(ops);
WARN("cgfsng_create called a second time: %s", ops->container_cgroup);
return false;
}
+ if (inner) {
+ ERROR("cgfsng_create called twice for inner cgroup");
+ return false;
+ }
+
if (!conf)
return false;
@@ -1482,7 +1522,7 @@ again:
}
for (i = 0; ops->hierarchies[i]; i++) {
- if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup)) {
+ if (!container_create_path_for_hierarchy(ops->hierarchies[i], container_cgroup, false)) {
ERROR("Failed to create cgroup \"%s\"", ops->hierarchies[i]->container_full_path);
free(ops->hierarchies[i]->container_full_path);
ops->hierarchies[i]->container_full_path = NULL;
@@ -1505,7 +1545,8 @@ out_free:
}
__cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
- bool monitor)
+ bool monitor,
+ bool inner)
{
int len;
char pidstr[INTTYPE_TO_STRLEN(pid_t)];
@@ -1521,6 +1562,9 @@ __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
if (monitor)
path = must_make_path(ops->hierarchies[i]->monitor_full_path,
"cgroup.procs", NULL);
+ else if (inner)
+ path = must_make_path(ops->hierarchies[i]->container_inner_path,
+ "cgroup.procs", NULL);
else
path = must_make_path(ops->hierarchies[i]->container_full_path,
"cgroup.procs", NULL);
@@ -1538,12 +1582,12 @@ __cgfsng_ops static bool __do_cgroup_enter(struct cgroup_ops *ops, pid_t pid,
__cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops, pid_t pid)
{
- return __do_cgroup_enter(ops, pid, true);
+ return __do_cgroup_enter(ops, pid, true, false);
}
-static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid)
+static bool cgfsng_payload_enter(struct cgroup_ops *ops, pid_t pid, bool inner)
{
- return __do_cgroup_enter(ops, pid, false);
+ return __do_cgroup_enter(ops, pid, false, inner);
}
static int chowmod(char *path, uid_t chown_uid, gid_t chown_gid,
@@ -1609,9 +1653,15 @@ static int chown_cgroup_wrapper(void *data)
char *fullpath;
char *path = arg->hierarchies[i]->container_full_path;
+ if (arg->inner)
+ path = must_make_path(path, CGROUP_NAMESPACE_SUBDIR, NULL);
+
ret = chowmod(path, destuid, nsgid, 0775);
- if (ret < 0)
+ if (ret < 0) {
+ if (arg->inner)
+ free(path);
return -1;
+ }
/* Failures to chown() these are inconvenient but not
* detrimental We leave these owned by the container launcher,
@@ -1630,8 +1680,11 @@ static int chown_cgroup_wrapper(void *data)
(void)chowmod(fullpath, destuid, nsgid, 0664);
free(fullpath);
- if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC)
+ if (arg->hierarchies[i]->version != CGROUP2_SUPER_MAGIC) {
+ if (arg->inner)
+ free(path);
continue;
+ }
fullpath = must_make_path(path, "cgroup.subtree_control", NULL);
(void)chowmod(fullpath, destuid, nsgid, 0664);
@@ -1640,13 +1693,17 @@ static int chown_cgroup_wrapper(void *data)
fullpath = must_make_path(path, "cgroup.threads", NULL);
(void)chowmod(fullpath, destuid, nsgid, 0664);
free(fullpath);
+
+ if (arg->inner)
+ free(path);
}
return 0;
}
__cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
- struct lxc_conf *conf)
+ struct lxc_conf *conf,
+ bool inner)
{
struct generic_userns_exec_data wrap;
@@ -1657,6 +1714,7 @@ __cgfsng_ops static bool cgfsng_chown(struct cgroup_ops *ops,
wrap.path = NULL;
wrap.hierarchies = ops->hierarchies;
wrap.conf = conf;
+ wrap.inner = inner;
if (userns_exec_1(conf, chown_cgroup_wrapper, &wrap,
"chown_cgroup_wrapper") < 0) {
@@ -2038,7 +2096,8 @@ __cgfsng_ops static bool cgfsng_unfreeze(struct cgroup_ops *ops)
}
__cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
- const char *controller)
+ const char *controller,
+ bool inner)
{
struct hierarchy *h;
@@ -2049,6 +2108,9 @@ __cgfsng_ops static const char *cgfsng_get_cgroup(struct cgroup_ops *ops,
return NULL;
}
+ if (inner)
+ return h->container_inner_path ? h->container_inner_path + strlen(h->mountpoint) : NULL;
+
return h->container_full_path ? h->container_full_path + strlen(h->mountpoint) : NULL;
}
@@ -2080,7 +2142,7 @@ static int __cg_unified_attach(const struct hierarchy *h, const char *name,
int fret = -1, idx = 0;
char *base_path = NULL, *container_cgroup = NULL, *full_path = NULL;
- container_cgroup = lxc_cmd_get_cgroup_path(name, lxcpath, controller);
+ container_cgroup = lxc_cmd_get_attach_cgroup_path(name, lxcpath, controller);
/* not running */
if (!container_cgroup)
return 0;
@@ -2161,7 +2223,7 @@ __cgfsng_ops static bool cgfsng_attach(struct cgroup_ops *ops, const char *name,
continue;
}
- path = lxc_cmd_get_cgroup_path(name, lxcpath, h->controllers[0]);
+ path = lxc_cmd_get_attach_cgroup_path(name, lxcpath, h->controllers[0]);
/* not running */
if (!path)
continue;
diff --git a/src/lxc/cgroups/cgroup.h b/src/lxc/cgroups/cgroup.h
index d4dcd506b..59445b5a5 100644
--- a/src/lxc/cgroups/cgroup.h
+++ b/src/lxc/cgroups/cgroup.h
@@ -32,6 +32,12 @@
#define MONITOR_CGROUP "lxc.monitor"
#define PIVOT_CGROUP "lxc.pivot"
+/* When lxc.cgroup.protect_limits is in effect the container's cgroup namespace
+ * will be moved into an additional subdirectory "cgns/" inside the cgroup in
+ * order to prevent it from accessing the outer limiting cgroup.
+ */
+#define CGROUP_NAMESPACE_SUBDIR "cgns"
+
struct lxc_handler;
struct lxc_conf;
struct lxc_list;
@@ -72,6 +78,9 @@ typedef enum {
* @monitor_full_path
* - The full path to the monitor's cgroup.
*
+ * @container_inner_path
+ * - The full path to the container's inner cgroup when protect_limits is used.
+ *
* @version
* - legacy hierarchy
* If the hierarchy is a legacy hierarchy this will be set to
@@ -85,6 +94,7 @@ struct hierarchy {
char *mountpoint;
char *container_base_path;
char *container_full_path;
+ char *container_inner_path;
char *monitor_full_path;
int version;
};
@@ -139,9 +149,9 @@ struct cgroup_ops {
void (*monitor_destroy)(struct cgroup_ops *ops, struct lxc_handler *handler);
bool (*monitor_create)(struct cgroup_ops *ops, struct lxc_handler *handler);
bool (*monitor_enter)(struct cgroup_ops *ops, pid_t pid);
- bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler);
- bool (*payload_enter)(struct cgroup_ops *ops, pid_t pid);
- const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller);
+ bool (*payload_create)(struct cgroup_ops *ops, struct lxc_handler *handler, bool inner);
+ bool (*payload_enter)(struct cgroup_ops *ops, pid_t pid, bool inner);
+ const char *(*get_cgroup)(struct cgroup_ops *ops, const char *controller, bool inner);
bool (*escape)(const struct cgroup_ops *ops, struct lxc_conf *conf);
int (*num_hierarchies)(struct cgroup_ops *ops);
bool (*get_hierarchies)(struct cgroup_ops *ops, int n, char ***out);
@@ -152,7 +162,7 @@ struct cgroup_ops {
bool (*unfreeze)(struct cgroup_ops *ops);
bool (*setup_limits)(struct cgroup_ops *ops, struct lxc_conf *conf,
bool with_devices);
- bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf);
+ bool (*chown)(struct cgroup_ops *ops, struct lxc_conf *conf, bool inner);
bool (*attach)(struct cgroup_ops *ops, const char *name,
const char *lxcpath, pid_t pid);
bool (*mount)(struct cgroup_ops *ops, struct lxc_handler *handler,
diff --git a/src/lxc/commands.c b/src/lxc/commands.c
index 133384d72..b41a76000 100644
--- a/src/lxc/commands.c
+++ b/src/lxc/commands.c
@@ -427,20 +427,8 @@ static int lxc_cmd_get_clone_flags_callback(int fd, struct lxc_cmd_req *req,
return lxc_cmd_rsp_send(fd, &rsp);
}
-/*
- * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
- * particular subsystem. This is the cgroup path relative to the root
- * of the cgroup filesystem.
- *
- * @name : name of container to connect to
- * @lxcpath : the lxcpath in which the container is running
- * @subsystem : the subsystem being asked about
- *
- * Returns the path on success, NULL on failure. The caller must free() the
- * returned path.
- */
-char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
- const char *subsystem)
+char *do_lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
+ const char *subsystem, bool inner)
{
int ret, stopped;
struct lxc_cmd_rr cmd = {
@@ -453,8 +441,18 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
cmd.req.data = subsystem;
cmd.req.datalen = 0;
- if (subsystem)
- cmd.req.datalen = strlen(subsystem) + 1;
+ if (subsystem) {
+ size_t subsyslen = strlen(subsystem);
+ if (inner) {
+ char *data = alloca(subsyslen+2);
+ memcpy(data, subsystem, subsyslen+1);
+ data[subsyslen+1] = 1;
+ cmd.req.datalen = subsyslen+2,
+ cmd.req.data = data;
+ } else {
+ cmd.req.datalen = subsyslen+1;
+ }
+ }
ret = lxc_cmd(name, &cmd, &stopped, lxcpath, NULL);
if (ret < 0)
@@ -469,6 +467,42 @@ char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
return cmd.rsp.data;
}
+/*
+ * lxc_cmd_get_cgroup_path: Calculate a container's cgroup path for a
+ * particular subsystem. This is the cgroup path relative to the root
+ * of the cgroup filesystem.
+ *
+ * @name : name of container to connect to
+ * @lxcpath : the lxcpath in which the container is running
+ * @subsystem : the subsystem being asked about
+ *
+ * Returns the path on success, NULL on failure. The caller must free() the
+ * returned path.
+ */
+char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
+ const char *subsystem)
+{
+ return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, false);
+}
+
+/*
+ * lxc_cmd_get_attach_cgroup_path: Calculate a container's inner cgroup path
+ * for a particular subsystem. This is the cgroup path relative to the root
+ * of the cgroup filesystem.
+ *
+ * @name : name of container to connect to
+ * @lxcpath : the lxcpath in which the container is running
+ * @subsystem : the subsystem being asked about
+ *
+ * Returns the path on success, NULL on failure. The caller must free() the
+ * returned path.
+ */
+char *lxc_cmd_get_attach_cgroup_path(const char *name, const char *lxcpath,
+ const char *subsystem)
+{
+ return do_lxc_cmd_get_cgroup_path(name, lxcpath, subsystem, true);
+}
+
static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
struct lxc_handler *handler)
{
@@ -476,10 +510,21 @@ static int lxc_cmd_get_cgroup_callback(int fd, struct lxc_cmd_req *req,
struct lxc_cmd_rsp rsp;
struct cgroup_ops *cgroup_ops = handler->cgroup_ops;
- if (req->datalen > 0)
- path = cgroup_ops->get_cgroup(cgroup_ops, req->data);
- else
- path = cgroup_ops->get_cgroup(cgroup_ops, NULL);
+ if (req->datalen > 0) {
+ const char *subsystem;
+ size_t subsyslen;
+ bool inner = false;
+ subsystem = req->data;
+ subsyslen = strlen(subsystem);
+ if (req->datalen == subsyslen+2)
+ inner = (subsystem[subsyslen+1] == 1);
+
+ path = cgroup_ops->get_cgroup(cgroup_ops, req->data, inner);
+ } else {
+ // FIXME: cgroup separation for cgroup v2 cannot be handled
+ // like we used to do v1 here... need to figure this out...
+ path = cgroup_ops->get_cgroup(cgroup_ops, NULL, false);
+ }
if (!path)
return -1;
@@ -651,7 +696,7 @@ static int lxc_cmd_stop_callback(int fd, struct lxc_cmd_req *req,
* lxc_unfreeze() would do another cmd (GET_CGROUP) which would
* deadlock us.
*/
- if (!cgroup_ops->get_cgroup(cgroup_ops, "freezer"))
+ if (!cgroup_ops->get_cgroup(cgroup_ops, "freezer", false))
return 0;
if (cgroup_ops->unfreeze(cgroup_ops))
diff --git a/src/lxc/commands.h b/src/lxc/commands.h
index 2c024b65d..7c4c00b1e 100644
--- a/src/lxc/commands.h
+++ b/src/lxc/commands.h
@@ -88,6 +88,8 @@ extern int lxc_cmd_console(const char *name, int *ttynum, int *fd,
*/
extern char *lxc_cmd_get_cgroup_path(const char *name, const char *lxcpath,
const char *subsystem);
+extern char *lxc_cmd_get_attach_cgroup_path(const char *name,
+ const char *lxcpath, const char *subsystem);
extern int lxc_cmd_get_clone_flags(const char *name, const char *lxcpath);
extern char *lxc_cmd_get_config_item(const char *name, const char *item, const char *lxcpath);
extern char *lxc_cmd_get_name(const char *hashed_sock);
diff --git a/src/lxc/criu.c b/src/lxc/criu.c
index 3d857b541..ec9bcb7e4 100644
--- a/src/lxc/criu.c
+++ b/src/lxc/criu.c
@@ -332,7 +332,7 @@ static void exec_criu(struct cgroup_ops *cgroup_ops, struct lxc_conf *conf,
} else {
const char *p;
- p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0]);
+ p = cgroup_ops->get_cgroup(cgroup_ops, controllers[0], false);
if (!p) {
ERROR("failed to get cgroup path for %s", controllers[0]);
goto err;
@@ -976,7 +976,7 @@ static void do_restore(struct lxc_container *c, int status_pipe, struct migrate_
goto out_fini_handler;
handler->cgroup_ops = cgroup_ops;
- if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
+ if (!cgroup_ops->payload_create(cgroup_ops, handler, false)) {
ERROR("failed creating groups");
goto out_fini_handler;
}
diff --git a/src/lxc/start.c b/src/lxc/start.c
index dae3bcfe5..f3b29d6cd 100644
--- a/src/lxc/start.c
+++ b/src/lxc/start.c
@@ -1649,7 +1649,7 @@ static int lxc_spawn(struct lxc_handler *handler)
}
}
- if (!cgroup_ops->payload_create(cgroup_ops, handler)) {
+ if (!cgroup_ops->payload_create(cgroup_ops, handler, false)) {
ERROR("Failed creating cgroups");
goto out_delete_net;
}
@@ -1743,10 +1743,10 @@ static int lxc_spawn(struct lxc_handler *handler)
goto out_delete_net;
}
- if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid))
+ if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid, false))
goto out_delete_net;
- if (!cgroup_ops->chown(cgroup_ops, handler->conf))
+ if (!cgroup_ops->chown(cgroup_ops, handler->conf, false))
goto out_delete_net;
/* Now we're ready to preserve the network namespace */
@@ -1813,16 +1813,30 @@ static int lxc_spawn(struct lxc_handler *handler)
}
}
- ret = lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE);
- if (ret < 0)
- goto out_delete_net;
-
if (!cgroup_ops->setup_limits(cgroup_ops, handler->conf, true)) {
ERROR("Failed to setup legacy device cgroup controller limits");
goto out_delete_net;
}
TRACE("Set up legacy device cgroup controller limits");
+ if (cgns_supported()) {
+ if (!cgroup_ops->payload_create(cgroup_ops, handler, true)) {
+ ERROR("failed to create inner cgroup separation layer");
+ goto out_delete_net;
+ }
+ if (!cgroup_ops->payload_enter(cgroup_ops, handler->pid, true)) {
+ ERROR("failed to enter inner cgroup separation layer");
+ goto out_delete_net;
+ }
+ if (!cgroup_ops->chown(cgroup_ops, handler->conf, true)) {
+ ERROR("failed chown inner cgroup separation layer");
+ goto out_delete_net;
+ }
+ }
+
+ if (lxc_sync_barrier_child(handler, LXC_SYNC_CGROUP_UNSHARE))
+ goto out_delete_net;
+
if (handler->ns_clone_flags & CLONE_NEWCGROUP) {
/* Now we're ready to preserve the cgroup namespace */
ret = lxc_try_preserve_ns(handler->pid, "cgroup");
--
2.20.1