From 8deca6c986f1aaab084ab5bb6692c8a29068bba1 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 11 Dec 2017 12:55:23 +0100 Subject: [PATCH 1/2] start: intelligently use clone() on ns sharing When I first solved this problem I went for a fork() + setns() + clone() model. This works fine but has unnecessary overhead for a couple of reasons: - doing a full fork() including copying file descriptor table and virtual memory - using pipes to retrieve the pid of the second child (the actual container process) This can all be avoided by being a little smart in how we employ the clone() syscall: - using CLONE_VM will let us get rid of using pipes since we can simply write to the handler because we share the memory with our parent - using CLONE_VFORK will also let us get rid of using pipes since the execution of the parent is suspended until the child returns - using CLONE_VM will not cause virtual memory to be copied - using CLONE_FILES will not cause the file descriptor table to be copied Note that the intermediate clone() is used with CLONE_VM. Some glibc versions used to reset the pid/tid to -1 when CLONE_VM was used without CLONE_THREAD. But since the memory between parent and child is shared on CLONE_VM this would invalidate the getpid() cache that glibc used to maintain and so getpid() in the child would return the parent's pid. This is all fixed in newer glibc versions where the getpid() cache is removed and the pid/tid is not reset anymore. However, if for whatever reason you - dear commiter - somehow need to get the pid of the dummy intermediate process for do_share_ns() you need to call syscall(__NR_getpid) directly. The next lxc_clone() call does not employ CLONE_VM and will be fine. Signed-off-by: Christian Brauner --- src/lxc/start.c | 72 ++++++++++++++++++++++++------------------------- src/lxc/start.h | 6 +++++ 2 files changed, 41 insertions(+), 37 deletions(-) diff --git a/src/lxc/start.c b/src/lxc/start.c index b82768687..a8acfea17 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -1157,43 +1157,41 @@ void resolve_clone_flags(struct lxc_handler *handler) INFO("Inheriting pid namespace"); } -static pid_t lxc_fork_attach_clone(int (*fn)(void *), void *arg, int flags) +/* Note that this function is used with clone(CLONE_VM). Some glibc versions + * used to reset the pid/tid to -1 when CLONE_VM was used without CLONE_THREAD. + * But since the memory between parent and child is shared on CLONE_VM this + * would invalidate the getpid() cache that glibc used to maintain and so + * getpid() in the child would return the parent's pid. This is all fixed in + * newer glibc versions where the getpid() cache is removed and the pid/tid is + * not reset anymore. + * However, if for whatever reason you - dear commiter - somehow need to get the + * pid of the dummy intermediate process for do_share_ns() you need to call + * syscall(__NR_getpid) directly. The next lxc_clone() call does not employ + * CLONE_VM and will be fine. + */ +static inline int do_share_ns(void *arg) { - int i, r, ret; - pid_t pid, init_pid; + int i, flags, ret; struct lxc_handler *handler = arg; - pid = fork(); - if (pid < 0) - return -1; + for (i = 0; i < LXC_NS_MAX; i++) { + if (handler->nsfd[i] < 0) + continue; - if (!pid) { - for (i = 0; i < LXC_NS_MAX; i++) { - if (handler->nsfd[i] < 0) - continue; - - ret = setns(handler->nsfd[i], 0); - if (ret < 0) - exit(EXIT_FAILURE); - - DEBUG("Inherited %s namespace", ns_info[i].proc_name); - } - - init_pid = lxc_clone(do_start, handler, flags); - ret = lxc_write_nointr(handler->data_sock[1], &init_pid, - sizeof(init_pid)); + ret = setns(handler->nsfd[i], 0); if (ret < 0) - exit(EXIT_FAILURE); + return -1; - exit(EXIT_SUCCESS); + DEBUG("Inherited %s namespace", ns_info[i].proc_name); } - r = lxc_read_nointr(handler->data_sock[0], &init_pid, sizeof(init_pid)); - ret = wait_for_pid(pid); - if (ret < 0 || r < 0) + flags = handler->on_clone_flags; + flags |= CLONE_PARENT; + handler->pid = lxc_clone(do_start, handler, flags); + if (handler->pid < 0) return -1; - return init_pid; + return 0; } /* lxc_spawn() performs crucial setup tasks and clone()s the new process which @@ -1205,13 +1203,13 @@ static pid_t lxc_fork_attach_clone(int (*fn)(void *), void *arg, int flags) */ static int lxc_spawn(struct lxc_handler *handler) { - int i, flags, ret; + int i, ret; char pidstr[20]; bool wants_to_map_ids; struct lxc_list *id_map; const char *name = handler->name; const char *lxcpath = handler->lxcpath; - bool cgroups_connected = false, fork_before_clone = false; + bool cgroups_connected = false, share_ns = false; struct lxc_conf *conf = handler->conf; id_map = &conf->id_map; @@ -1225,7 +1223,7 @@ static int lxc_spawn(struct lxc_handler *handler) if (handler->nsfd[i] < 0) return -1; - fork_before_clone = true; + share_ns = true; } if (lxc_sync_init(handler)) @@ -1288,28 +1286,28 @@ static int lxc_spawn(struct lxc_handler *handler) } /* Create a process in a new set of namespaces. */ - flags = handler->clone_flags; + handler->on_clone_flags = handler->clone_flags; if (handler->clone_flags & CLONE_NEWUSER) { /* If CLONE_NEWUSER and CLONE_NEWNET was requested, we need to * clone a new user namespace first and only later unshare our * network namespace to ensure that network devices ownership is * set up correctly. */ - flags &= ~CLONE_NEWNET; + handler->on_clone_flags &= ~CLONE_NEWNET; } - if (fork_before_clone) - handler->pid = lxc_fork_attach_clone(do_start, handler, flags | CLONE_PARENT); + if (share_ns) + ret = lxc_clone(do_share_ns, handler, CLONE_VFORK | CLONE_VM | CLONE_FILES); else - handler->pid = lxc_clone(do_start, handler, flags); - if (handler->pid < 0) { + handler->pid = lxc_clone(do_start, handler, handler->on_clone_flags); + if (handler->pid < 0 || ret < 0) { SYSERROR("Failed to clone a new set of namespaces."); goto out_delete_net; } TRACE("Cloned child process %d", handler->pid); for (i = 0; i < LXC_NS_MAX; i++) - if (flags & ns_info[i].clone_flag) + if (handler->on_clone_flags & ns_info[i].clone_flag) INFO("Cloned %s", ns_info[i].flag_name); if (!preserve_ns(handler->nsfd, handler->clone_flags & ~CLONE_NEWNET, handler->pid)) { diff --git a/src/lxc/start.h b/src/lxc/start.h index e2b13141b..2161565d4 100644 --- a/src/lxc/start.h +++ b/src/lxc/start.h @@ -40,6 +40,12 @@ struct lxc_handler { /* The clone flags that were requested. */ int clone_flags; + /* The clone flags to actually use when calling lxc_clone(). They may + * differ from clone_flags because of ordering requirements (e.g. + * CLONE_NEWNET and CLONE_NEWUSER). + */ + int on_clone_flags; + /* File descriptor to pin the rootfs for privileged containers. */ int pinfd; From 7acb5ce30ddfa114e7fee62aaa5dfe84b8df1071 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 11 Dec 2017 14:47:24 +0100 Subject: [PATCH 2/2] tests: add namespace sharing tests This also ensures that the new more efficient clone() way of sharing namespaces is tested. Signed-off-by: Christian Brauner --- src/tests/Makefile.am | 6 +- src/tests/share_ns.c | 313 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 317 insertions(+), 2 deletions(-) create mode 100644 src/tests/share_ns.c diff --git a/src/tests/Makefile.am b/src/tests/Makefile.am index f223463d7..b38c93c67 100644 --- a/src/tests/Makefile.am +++ b/src/tests/Makefile.am @@ -31,6 +31,7 @@ lxc_test_config_jump_table_SOURCES = config_jump_table.c lxctest.h lxc_test_shortlived_SOURCES = shortlived.c lxc_test_livepatch_SOURCES = livepatch.c lxctest.h lxc_test_state_server_SOURCES = state_server.c lxctest.h +lxc_test_share_ns_SOURCES = share_ns.c lxctest.h AM_CFLAGS=-DLXCROOTFSMOUNT=\"$(LXCROOTFSMOUNT)\" \ -DLXCPATH=\"$(LXCPATH)\" \ @@ -60,7 +61,7 @@ bin_PROGRAMS = lxc-test-containertests lxc-test-locktests lxc-test-startone \ lxc-test-reboot lxc-test-list lxc-test-attach lxc-test-device-add-remove \ lxc-test-apparmor lxc-test-utils lxc-test-parse-config-file \ lxc-test-config-jump-table lxc-test-shortlived lxc-test-livepatch \ - lxc-test-api-reboot lxc-test-state-server + lxc-test-api-reboot lxc-test-state-server lxc-test-share-ns bin_SCRIPTS = lxc-test-automount \ lxc-test-autostart \ @@ -121,7 +122,8 @@ EXTRA_DIST = \ shutdowntest.c \ snapshot.c \ startone.c \ - state_server.c + state_server.c \ + share_ns.c clean-local: rm -f lxc-test-utils-* diff --git a/src/tests/share_ns.c b/src/tests/share_ns.c new file mode 100644 index 000000000..1b5a6b515 --- /dev/null +++ b/src/tests/share_ns.c @@ -0,0 +1,313 @@ +/* liblxcapi + * + * Copyright © 2017 Christian Brauner . + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lxc/lxccontainer.h" +#include "lxctest.h" + +struct thread_args { + int thread_id; + bool success; + pid_t init_pid; + char *inherited_ipc_ns; + char *inherited_net_ns; +}; + +void *ns_sharing_wrapper(void *data) +{ + int init_pid; + ssize_t ret; + char name[100]; + char owning_ns_init_pid[100]; + char proc_ns_path[4096]; + char ns_buf[4096]; + struct lxc_container *c; + struct thread_args *args = data; + + lxc_debug("Starting namespace sharing thread %d\n", args->thread_id); + + sprintf(name, "share-ns-%d", args->thread_id); + c = lxc_container_new(name, NULL); + if (!c) { + lxc_error("Failed to create container \"%s\"\n", name); + goto out; + } + + if (c->is_defined(c)) { + lxc_error("Container \"%s\" is defined\n", name); + goto out; + } + + if (!c->createl(c, "busybox", NULL, NULL, 0, NULL)) { + lxc_error("Failed to create busybox container \"%s\"\n", name); + goto out; + } + + if (!c->is_defined(c)) { + lxc_error("Container \"%s\" is not defined\n", name); + goto out; + } + + if (!c->load_config(c, NULL)) { + lxc_error("Failed to load config for container \"%s\"\n", name); + goto out; + } + + /* share ipc namespace by container name */ + if (!c->set_config_item(c, "lxc.namespace.ipc", "owning-ns")) { + lxc_error("Failed to set \"lxc.namespace.ipc=owning-ns\" for container \"%s\"\n", name); + goto out; + } + + /* clear all network configuration */ + if (!c->set_config_item(c, "lxc.net", "")) { + lxc_error("Failed to set \"lxc.namespace.ipc=owning-ns\" for container \"%s\"\n", name); + goto out; + } + + if (!c->set_config_item(c, "lxc.net.0.type", "empty")) { + lxc_error("Failed to set \"lxc.net.0.type=empty\" for container \"%s\"\n", name); + goto out; + } + + sprintf(owning_ns_init_pid, "%d", args->init_pid); + /* share net namespace by pid */ + if (!c->set_config_item(c, "lxc.namespace.net", owning_ns_init_pid)) { + lxc_error("Failed to set \"lxc.namespace.net=%s\" for container \"%s\"\n", owning_ns_init_pid, name); + goto out; + } + + if (!c->want_daemonize(c, true)) { + lxc_error("Failed to mark container \"%s\" daemonized\n", name); + goto out; + } + + if (!c->startl(c, 0, NULL)) { + lxc_error("Failed to start container \"%s\" daemonized\n", name); + goto out; + } + + init_pid = c->init_pid(c); + if (init_pid < 0) { + lxc_error("Failed to retrieve init pid of container \"%s\"\n", name); + goto out; + } + + /* Check whether we correctly inherited the ipc namespace. */ + ret = snprintf(proc_ns_path, sizeof(proc_ns_path), "/proc/%d/ns/ipc", init_pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_ns_path)) { + lxc_error("Failed to create string for container \"%s\"\n", name); + goto out; + } + + ret = readlink(proc_ns_path, ns_buf, sizeof(ns_buf)); + if (ret < 0 || (size_t)ret >= sizeof(ns_buf)) { + lxc_error("Failed to retrieve ipc namespace for container \"%s\"\n", name); + goto out; + } + ns_buf[ret] = '\0'; + + if (strcmp(args->inherited_ipc_ns, ns_buf) != 0) { + lxc_error("Failed to inherit ipc namespace from container \"owning-ns\": %s != %s\n", args->inherited_ipc_ns, ns_buf); + goto out; + } + lxc_debug("Inherited ipc namespace from container \"owning-ns\": %s == %s\n", args->inherited_ipc_ns, ns_buf); + + /* Check whether we correctly inherited the net namespace. */ + ret = snprintf(proc_ns_path, sizeof(proc_ns_path), "/proc/%d/ns/net", init_pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_ns_path)) { + lxc_error("Failed to create string for container \"%s\"\n", name); + goto out; + } + + ret = readlink(proc_ns_path, ns_buf, sizeof(ns_buf)); + if (ret < 0 || (size_t)ret >= sizeof(ns_buf)) { + lxc_error("Failed to retrieve ipc namespace for container \"%s\"\n", name); + goto out; + } + ns_buf[ret] = '\0'; + + if (strcmp(args->inherited_net_ns, ns_buf) != 0) { + lxc_error("Failed to inherit net namespace from container \"owning-ns\": %s != %s\n", args->inherited_net_ns, ns_buf); + goto out; + } + lxc_debug("Inherited net namespace from container \"owning-ns\": %s == %s\n", args->inherited_net_ns, ns_buf); + + args->success = true; + +out: + if (c->is_running(c) && !c->stop(c)) { + lxc_error("Failed to stop container \"%s\"\n", name); + goto out; + } + + if (!c->destroy(c)) { + lxc_error("Failed to destroy container \"%s\"\n", name); + goto out; + } + + pthread_exit(NULL); + return NULL; +} + +int main(int argc, char *argv[]) +{ + int i, init_pid, j; + char proc_ns_path[4096]; + char ipc_ns_buf[4096]; + char net_ns_buf[4096]; + pthread_attr_t attr; + pthread_t threads[10]; + struct thread_args args[10]; + struct lxc_container *c; + int ret = EXIT_FAILURE; + + c = lxc_container_new("owning-ns", NULL); + if (!c) { + lxc_error("%s", "Failed to create container \"owning-ns\""); + exit(ret); + } + + if (c->is_defined(c)) { + lxc_error("%s\n", "Container \"owning-ns\" is defined"); + goto on_error_put; + } + + if (!c->createl(c, "busybox", NULL, NULL, 0, NULL)) { + lxc_error("%s\n", "Failed to create busybox container \"owning-ns\""); + goto on_error_put; + } + + if (!c->is_defined(c)) { + lxc_error("%s\n", "Container \"owning-ns\" is not defined"); + goto on_error_put; + } + + c->clear_config(c); + + if (!c->load_config(c, NULL)) { + lxc_error("%s\n", "Failed to load config for container \"owning-ns\""); + goto on_error_stop; + } + + if (!c->want_daemonize(c, true)) { + lxc_error("%s\n", "Failed to mark container \"owning-ns\" daemonized"); + goto on_error_stop; + } + + if (!c->startl(c, 0, NULL)) { + lxc_error("%s\n", "Failed to start container \"owning-ns\" daemonized"); + goto on_error_stop; + } + + init_pid = c->init_pid(c); + if (init_pid < 0) { + lxc_error("%s\n", "Failed to retrieve init pid of container \"owning-ns\""); + goto on_error_stop; + } + + /* record our ipc namespace */ + ret = snprintf(proc_ns_path, sizeof(proc_ns_path), "/proc/%d/ns/ipc", init_pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_ns_path)) { + lxc_error("%s\n", "Failed to create string for container \"owning-ns\""); + goto on_error_stop; + } + + ret = readlink(proc_ns_path, ipc_ns_buf, sizeof(ipc_ns_buf)); + if (ret < 0 || (size_t)ret >= sizeof(ipc_ns_buf)) { + lxc_error("%s\n", "Failed to retrieve ipc namespace for container \"owning-ns\""); + goto on_error_stop; + + } + ipc_ns_buf[ret] = '\0'; + + /* record our net namespace */ + ret = snprintf(proc_ns_path, sizeof(proc_ns_path), "/proc/%d/ns/net", init_pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_ns_path)) { + lxc_error("%s\n", "Failed to create string for container \"owning-ns\""); + goto on_error_stop; + } + + ret = readlink(proc_ns_path, net_ns_buf, sizeof(net_ns_buf)); + if (ret < 0 || (size_t)ret >= sizeof(net_ns_buf)) { + lxc_error("%s\n", "Failed to retrieve ipc namespace for container \"owning-ns\""); + goto on_error_stop; + } + net_ns_buf[ret] = '\0'; + + sleep(5); + + pthread_attr_init(&attr); + + for (j = 0; j < 10; j++) { + lxc_debug("Starting namespace sharing test iteration %d\n", j); + + for (i = 0; i < 10; i++) { + int ret; + + args[i].thread_id = i; + args[i].success = false; + args[i].init_pid = init_pid; + args[i].inherited_ipc_ns = ipc_ns_buf; + args[i].inherited_net_ns = net_ns_buf; + + ret = pthread_create(&threads[i], &attr, ns_sharing_wrapper, (void *) &args[i]); + if (ret != 0) + goto on_error_stop; + } + + for (i = 0; i < 10; i++) { + int ret; + + ret = pthread_join(threads[i], NULL); + if (ret != 0) + goto on_error_stop; + + if (!args[i].success) { + lxc_error("ns sharing thread %d failed\n", args[i].thread_id); + goto on_error_stop; + } + } + } + + ret = EXIT_SUCCESS; + +on_error_stop: + if (c->is_running(c) && !c->stop(c)) + lxc_error("%s\n", "Failed to stop container \"owning-ns\""); + + if (!c->destroy(c)) + lxc_error("%s\n", "Failed to destroy container \"owning-ns\""); + +on_error_put: + lxc_container_put(c); + if (ret == EXIT_SUCCESS) + lxc_debug("%s\n", "All state namespace sharing tests passed"); + exit(ret); +}