diff --git a/src/lxc/lxc_attach.c b/src/lxc/lxc_attach.c index e1511efdf..1f60266b7 100644 --- a/src/lxc/lxc_attach.c +++ b/src/lxc/lxc_attach.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include "attach.h" @@ -128,9 +129,9 @@ int main(int argc, char *argv[]) struct passwd *passwd; struct lxc_proc_context_info *init_ctx; struct lxc_handler *handler; - void *cgroup_data = NULL; uid_t uid; char *curdir; + int cgroup_ipc_sockets[2]; ret = lxc_caps_init(); if (ret) @@ -157,18 +158,6 @@ int main(int argc, char *argv[]) return -1; } - if (!elevated_privileges) { - /* we have to do this now since /sys/fs/cgroup may not - * be available inside the container or we may not have - * the required permissions anymore - */ - ret = lxc_cgroup_prepare_attach(my_args.name, &cgroup_data); - if (ret < 0) { - ERROR("failed to prepare attaching to cgroup"); - return -1; - } - } - curdir = getcwd(NULL, 0); /* determine which namespaces the container was created with @@ -184,6 +173,106 @@ int main(int argc, char *argv[]) } } + /* For the cgroup attaching logic to work in conjunction with pid and user namespaces, + * we need to have the following hierarchy: + * + * lxc-attach [process executed externally] + * | socketpair(cgroup_ipc_sockets) + * | fork() -> child + * | | setns() + * | | fork() -> grandchild + * | | | initialize + * | | | signal parent + * | |<------------------|----+ + * | | signal parent | + * |<----------------------|-----+ | + * | add to cgroups | | + * | signal child -------->| | + * | | signal child ---->| + * | waitpid() | waitpid() | exec() + * | |<------------------| exit() + * |<----------------------| exit() + * | exit() + * + * The rationale is the following: The first parent is needed because after + * setns() (mount + user namespace) we can't access the cgroup filesystem + * to add the pid to the corresponding cgroup. Therefore, we need to do that + * in a process executed on the host, so that's why we need to fork and wait + * for it to have done some initialization (cgroups may restrict certain + * operations so we have to do that in the end) and use IPC for signaling. + * + * Then in the child process we do the setns(). However, a process is never + * really attached to a pid namespace (never changes its pid, doesn't appear + * in the pid namespace /proc), only child processes of that process are + * truely inside the new pid namespace. That's why we need to fork() again + * after setns() before performing final initializations, then signal our + * parent, which signals the primary process, which does cgroup adding, + * which then signals to the grandchild that it can exec(). + */ + ret = socketpair(PF_LOCAL, SOCK_STREAM, 0, cgroup_ipc_sockets); + if (ret < 0) { + SYSERROR("could not set up required IPC mechanism for attaching"); + return -1; + } + + pid = fork(); + if (pid < 0) { + SYSERROR("failed to create first subprocess"); + return -1; + } + + if (pid) { + int status; + pid_t grandchild; + + close(cgroup_ipc_sockets[1]); + + gparent_reread: + ret = read(cgroup_ipc_sockets[0], &grandchild, sizeof(grandchild)); + if (ret <= 0) { + if (ret < 0 && (errno == EAGAIN || errno == EINTR)) + goto gparent_reread; + ERROR("failed to get pid of attached process to add to cgroup"); + return -1; + } + + if (!elevated_privileges) { + ret = lxc_cgroup_attach(my_args.name, grandchild); + if (ret < 0) { + ERROR("failed to attach process to cgroup"); + return -1; + } + } + + status = 0; + ret = write(cgroup_ipc_sockets[0], &status, sizeof(status)); + if (ret <= 0) { + ERROR("failed to signal child that cgroup logic has finished"); + return -1; + } + + close(cgroup_ipc_sockets[0]); + + gparent_again: + ret = waitpid(pid, &status, 0); + if (ret < 0) { + if (errno == EINTR) + goto gparent_again; + SYSERROR("failed to wait for process '%d'", pid); + return -1; + } + + if (WIFEXITED(status)) + return WEXITSTATUS(status); + + return -1; + } + + /* at this point we are in the 'parent' process so we need to close the + * socket reserved for the 'grandparent' process + */ + close(cgroup_ipc_sockets[0]); + /* we need to attach before we fork since certain namespaces * (such as pid namespaces) only really affect children of the * current process and not the process itself @@ -199,7 +288,10 @@ int main(int argc, char *argv[]) free(curdir); - /* hack: we need sync.h infrastructure - and that needs a handler */ + /* hack: we need sync.h infrastructure - and that needs a handler + * FIXME: perhaps we should also just use a very simple socketpair() + * here? - like with the grandparent <-> parent communication? + */ handler = calloc(1, sizeof(*handler)); if (lxc_sync_init(handler)) { @@ -225,23 +317,40 @@ int main(int argc, char *argv[]) if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE)) return -1; - /* now that we are done with all privileged operations, - * we can add ourselves to the cgroup. Since we smuggled in - * the fds earlier, we still have write permission + /* ask grandparent to add child to cgroups, the grandparent will + * itself check whether that's actually necessary */ - if (!elevated_privileges) { - /* since setns() for pid namespaces only really - * affects child processes, the pid we have is - * still valid outside the container, so this is - * fine - */ - ret = lxc_cgroup_finish_attach(cgroup_data, pid); - if (ret < 0) { - ERROR("failed to attach process to cgroup"); - return -1; - } + ret = write(cgroup_ipc_sockets[1], &pid, sizeof(pid)); + if (ret != sizeof(pid)) { + ERROR("error using IPC to notify main process of pid to add to the cgroups of the container"); + return -1; } + parent_reread: + /* we need some mechanism to check whether the grandparent could + * add us to the cgroups or not - so we await a dummy integer + * on the same socket (that's why we don't use a pipe - we need + * two-way communication). So if the parent fails and exits, that + * will close the socket, which will cause a read of 0 bytes for + * us, so we just terminate. If we read at least a byte, we don't + * care about the contents... + */ + ret = read(cgroup_ipc_sockets[1], &status, sizeof(status)); + if (ret <= 0) { + if (ret < 0 && (errno == EAGAIN || errno == EINTR)) + goto parent_reread; + /* only print someting if we can't assume the parent already + * gave an error message, that will reduce confusion for the + * user + */ + if (ret != 0) + ERROR("failed to get notification that the child process was added to the container's cgroups"); + return -1; + } + + /* we don't need that IPC interface anymore */ + close(cgroup_ipc_sockets[1]); + /* tell the child we are done initializing */ if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE)) return -1; @@ -264,7 +373,7 @@ int main(int argc, char *argv[]) if (!pid) { lxc_sync_fini_parent(handler); - lxc_cgroup_dispose_attach(cgroup_data); + close(cgroup_ipc_sockets[1]); if (attach_apparmor(init_ctx->aa_profile) < 0) { ERROR("failed switching apparmor profiles"); @@ -307,6 +416,19 @@ int main(int argc, char *argv[]) lxc_sync_fini(handler); + if (namespace_flags & CLONE_NEWUSER) { + /* XXX FIXME this should get the uid of the container init and setuid to that */ + /* XXX FIXME or perhaps try to map in the lxc-attach caller's uid? */ + if (setgid(0)) { + SYSERROR("switching to container gid"); + return -1; + } + if (setuid(0)) { + SYSERROR("switching to container uid"); + return -1; + } + } + if (my_args.argc) { execvp(my_args.argv[0], my_args.argv); SYSERROR("failed to exec '%s'", my_args.argv[0]);