diff --git a/src/lxc/cgroup.c b/src/lxc/cgroup.c index 69ba4e5d1..a02ebc2fa 100644 --- a/src/lxc/cgroup.c +++ b/src/lxc/cgroup.c @@ -254,13 +254,38 @@ static int cgroup_enable_clone_children(const char *path) return ret; } -static int lxc_one_cgroup_attach(const char *name, - struct mntent *mntent, pid_t pid) +static int lxc_one_cgroup_finish_attach(int fd, pid_t pid) { - FILE *f; + char buf[32]; + int ret; + + snprintf(buf, 32, "%ld", (long)pid); + + ret = write(fd, buf, strlen(buf)); + if (ret <= 0) { + SYSERROR("failed to write pid '%ld' to fd '%d'", (long)pid, fd); + ret = -1; + } else { + ret = 0; + } + + close(fd); + return ret; +} + +static int lxc_one_cgroup_dispose_attach(int fd) +{ + close(fd); + return 0; +} + +static int lxc_one_cgroup_prepare_attach(const char *name, + struct mntent *mntent) +{ + int fd; char tasks[MAXPATHLEN], initcgroup[MAXPATHLEN]; char *cgmnt = mntent->mnt_dir; - int flags, ret = 0; + int flags; int rc; flags = get_cgroup_flags(mntent); @@ -274,31 +299,83 @@ static int lxc_one_cgroup_attach(const char *name, return -1; } - f = fopen(tasks, "w"); - if (!f) { + fd = open(tasks, O_WRONLY); + if (fd < 0) { SYSERROR("failed to open '%s'", tasks); return -1; } - if (fprintf(f, "%d", pid) <= 0) { - SYSERROR("failed to write pid '%d' to '%s'", pid, tasks); - ret = -1; + return fd; +} + +static int lxc_one_cgroup_attach(const char *name, struct mntent *mntent, pid_t pid) +{ + int fd; + + fd = lxc_one_cgroup_prepare_attach(name, mntent); + if (fd < 0) { + return -1; } - fclose(f); + return lxc_one_cgroup_finish_attach(fd, pid); +} + +int lxc_cgroup_dispose_attach(void *data) +{ + int *fds = data; + int ret, err; + + if (!fds) { + return 0; + } + + ret = 0; + + for (; *fds >= 0; fds++) { + err = lxc_one_cgroup_dispose_attach(*fds); + if (err) { + ret = err; + } + } + + free(data); return ret; } -/* - * for each mounted cgroup, attach a pid to the cgroup for the container - */ -int lxc_cgroup_attach(const char *name, pid_t pid) +int lxc_cgroup_finish_attach(void *data, pid_t pid) +{ + int *fds = data; + int err; + + if (!fds) { + return 0; + } + + for (; *fds >= 0; fds++) { + err = lxc_one_cgroup_finish_attach(*fds, pid); + if (err) { + /* get rid of the rest of them */ + lxc_cgroup_dispose_attach(data); + return -1; + } + *fds = -1; + } + + free(data); + + return 0; +} + +int lxc_cgroup_prepare_attach(const char *name, void **data) { struct mntent *mntent; FILE *file = NULL; int err = -1; int found = 0; + int *fds; + int i; + static const int MAXFDS = 256; file = setmntent(MTAB, "r"); if (!file) { @@ -306,7 +383,29 @@ int lxc_cgroup_attach(const char *name, pid_t pid) return -1; } + /* create a large enough buffer for all practical + * use cases + */ + fds = malloc(sizeof(int) * MAXFDS); + if (!fds) { + err = -1; + goto out; + } + for (i = 0; i < MAXFDS; i++) { + fds[i] = -1; + } + + err = 0; + i = 0; while ((mntent = getmntent(file))) { + if (i >= MAXFDS - 1) { + ERROR("too many cgroups to attach to, aborting"); + lxc_cgroup_dispose_attach(fds); + errno = ENOMEM; + err = -1; + goto out; + } + DEBUG("checking '%s' (%s)", mntent->mnt_dir, mntent->mnt_type); if (strcmp(mntent->mnt_type, "cgroup")) @@ -317,19 +416,41 @@ int lxc_cgroup_attach(const char *name, pid_t pid) INFO("[%d] found cgroup mounted at '%s',opts='%s'", ++found, mntent->mnt_dir, mntent->mnt_opts); - err = lxc_one_cgroup_attach(name, mntent, pid); - if (err) + fds[i] = lxc_one_cgroup_prepare_attach(name, mntent); + if (fds[i] < 0) { + err = fds[i]; + lxc_cgroup_dispose_attach(fds); goto out; + } + i++; }; if (!found) ERROR("No cgroup mounted on the system"); + *data = fds; + out: endmntent(file); return err; } +/* + * for each mounted cgroup, attach a pid to the cgroup for the container + */ +int lxc_cgroup_attach(const char *name, pid_t pid) +{ + void *data = NULL; + int ret; + + ret = lxc_cgroup_prepare_attach(name, &data); + if (ret < 0) { + return ret; + } + + return lxc_cgroup_finish_attach(data, pid); +} + /* * rename cgname, which is under cgparent, to a new name starting * with 'cgparent/dead'. That way cgname can be reused. Return diff --git a/src/lxc/cgroup.h b/src/lxc/cgroup.h index 3c90696dc..8167f3920 100644 --- a/src/lxc/cgroup.h +++ b/src/lxc/cgroup.h @@ -31,5 +31,8 @@ extern int lxc_cgroup_destroy(const char *name); extern int lxc_cgroup_path_get(char **path, const char *subsystem, const char *name); extern int lxc_cgroup_nrtasks(const char *name); extern int lxc_cgroup_attach(const char *name, pid_t pid); +extern int lxc_cgroup_prepare_attach(const char *name, void **data); +extern int lxc_cgroup_finish_attach(void *data, pid_t pid); +extern int lxc_cgroup_dispose_attach(void *data); extern int lxc_ns_is_mounted(void); #endif diff --git a/src/lxc/lxc_attach.c b/src/lxc/lxc_attach.c index 955e9f445..e4f604ba3 100644 --- a/src/lxc/lxc_attach.c +++ b/src/lxc/lxc_attach.c @@ -96,6 +96,7 @@ int main(int argc, char *argv[]) struct passwd *passwd; struct lxc_proc_context_info *init_ctx; struct lxc_handler *handler; + void *cgroup_data = NULL; uid_t uid; char *curdir; @@ -124,6 +125,35 @@ int main(int argc, char *argv[]) return -1; } + if (!elevated_privileges) { + /* we have to do this now since /sys/fs/cgroup may not + * be available inside the container or we may not have + * the required permissions anymore + */ + ret = lxc_cgroup_prepare_attach(my_args.name, &cgroup_data); + if (ret < 0) { + ERROR("failed to prepare attaching to cgroup"); + return -1; + } + } + + curdir = get_current_dir_name(); + + /* we need to attach before we fork since certain namespaces + * (such as pid namespaces) only really affect children of the + * current process and not the process itself + */ + ret = lxc_attach_to_ns(init_pid); + if (ret < 0) { + ERROR("failed to enter the namespace"); + return -1; + } + + if (curdir && chdir(curdir)) + WARN("could not change directory to '%s'", curdir); + + free(curdir); + /* hack: we need sync.h infrastructure - and that needs a handler */ handler = calloc(1, sizeof(*handler)); @@ -150,8 +180,22 @@ int main(int argc, char *argv[]) if (lxc_sync_wait_child(handler, LXC_SYNC_CONFIGURE)) return -1; - if (!elevated_privileges && lxc_cgroup_attach(my_args.name, pid)) - return -1; + /* now that we are done with all privileged operations, + * we can add ourselves to the cgroup. Since we smuggled in + * the fds earlier, we still have write permission + */ + if (!elevated_privileges) { + /* since setns() for pid namespaces only really + * affects child processes, the pid we have is + * still valid outside the container, so this is + * fine + */ + ret = lxc_cgroup_finish_attach(cgroup_data, pid); + if (ret < 0) { + ERROR("failed to attach process to cgroup"); + return -1; + } + } /* tell the child we are done initializing */ if (lxc_sync_wake_child(handler, LXC_SYNC_POST_CONFIGURE)) @@ -175,19 +219,7 @@ int main(int argc, char *argv[]) if (!pid) { lxc_sync_fini_parent(handler); - - curdir = get_current_dir_name(); - - ret = lxc_attach_to_ns(init_pid); - if (ret < 0) { - ERROR("failed to enter the namespace"); - return -1; - } - - if (curdir && chdir(curdir)) - WARN("could not change directory to '%s'", curdir); - - free(curdir); + lxc_cgroup_dispose_attach(cgroup_data); if (new_personality < 0) new_personality = init_ctx->personality;