diff --git a/src/lxc/attach.c b/src/lxc/attach.c index 733f3da36..437001079 100644 --- a/src/lxc/attach.c +++ b/src/lxc/attach.c @@ -59,6 +59,7 @@ #include "macro.h" #include "mainloop.h" #include "namespace.h" +#include "raw_syscalls.h" #include "terminal.h" #include "utils.h" diff --git a/src/lxc/namespace.c b/src/lxc/namespace.c index 8df173ff9..d6b4c2a76 100644 --- a/src/lxc/namespace.c +++ b/src/lxc/namespace.c @@ -75,80 +75,6 @@ pid_t lxc_clone(int (*fn)(void *), void *arg, int flags) return ret; } -/** - * This is based on raw_clone in systemd but adapted to our needs. This uses - * copy on write semantics and doesn't pass a stack. CLONE_VM is tricky and - * doesn't really matter to us so disallow it. - * - * The nice thing about this is that we get fork() behavior. That is - * lxc_raw_clone() returns 0 in the child and the child pid in the parent. - */ -pid_t lxc_raw_clone(unsigned long flags) -{ - - /* These flags don't interest at all so we don't jump through any hoopes - * of retrieving them and passing them to the kernel. - */ - errno = EINVAL; - if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | - CLONE_CHILD_CLEARTID | CLONE_SETTLS))) - return -EINVAL; - -#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__) - /* On s390/s390x and cris the order of the first and second arguments - * of the system call is reversed. - */ - return (int)syscall(__NR_clone, NULL, flags | SIGCHLD); -#elif defined(__sparc__) && defined(__arch64__) - { - /** - * sparc64 always returns the other process id in %o0, and - * a boolean flag whether this is the child or the parent in - * %o1. Inline assembly is needed to get the flag returned - * in %o1. 
- */ - int in_child; - int child_pid; - asm volatile("mov %2, %%g1\n\t" - "mov %3, %%o0\n\t" - "mov 0 , %%o1\n\t" - "t 0x6d\n\t" - "mov %%o1, %0\n\t" - "mov %%o0, %1" - : "=r"(in_child), "=r"(child_pid) - : "i"(__NR_clone), "r"(flags | SIGCHLD) - : "%o1", "%o0", "%g1"); - - if (in_child) - return 0; - else - return child_pid; - } -#elif defined(__ia64__) - /* On ia64 the stack and stack size are passed as separate arguments. */ - return (int)syscall(__NR_clone, flags | SIGCHLD, NULL, 0); -#else - return (int)syscall(__NR_clone, flags | SIGCHLD, NULL); -#endif -} - -pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags) -{ - pid_t pid; - - pid = lxc_raw_clone(flags); - if (pid < 0) - return -1; - - /* exit() is not thread-safe and might mess with the parent's signal - * handlers and other stuff when exec() fails. - */ - if (pid == 0) - _exit(fn(args)); - - return pid; -} - /* Leave the user namespace at the first position in the array of structs so * that we always attach to it first when iterating over the struct and using * setns() to switch namespaces. This especially affects lxc_attach(): Suppose diff --git a/src/lxc/namespace.h b/src/lxc/namespace.h index 8de2d5186..be2bf8b71 100644 --- a/src/lxc/namespace.h +++ b/src/lxc/namespace.h @@ -128,55 +128,13 @@ int clone(int (*fn)(void *), void *child_stack, * corresponding libc wrapper. glibc currently does not run pthread_atfork() * handlers but does not guarantee that they are not. Other libcs might or * might not run pthread_atfork() handlers. If you require guarantees please - * refer to the lxc_raw_clone*() functions below. + * refer to the lxc_raw_clone*() functions in raw_syscalls.{c,h}. * * - should call lxc_raw_getpid(): * The child should use lxc_raw_getpid() to retrieve its pid. */ extern pid_t lxc_clone(int (*fn)(void *), void *arg, int flags); -/** - * lxc_raw_clone() - create a new process - * - * - fork() behavior: - * This function returns 0 in the child and > 0 in the parent. 
- * - * - copy-on-write: - * This function does not allocate a new stack and relies on copy-on-write - * semantics. - * - * - supports subset of ClONE_* flags: - * lxc_raw_clone() intentionally only supports a subset of the flags available - * to the actual system call. Please refer to the implementation what flags - * cannot be used. Also, please don't assume that just because a flag isn't - * explicitly checked for as being unsupported that it is supported. If in - * doubt or not sufficiently familiar with process creation in the kernel and - * interactions with libcs this function should be used. - * - * - no pthread_atfork() handlers: - * This function circumvents - as much as this this is possible - any libc - * wrappers and thus does not run any pthread_atfork() handlers. Make sure - * that this is safe to do in the context you are trying to call this - * function. - * - * - must call lxc_raw_getpid(): - * The child must use lxc_raw_getpid() to retrieve its pid. - */ -extern pid_t lxc_raw_clone(unsigned long flags); -/** - * lxc_raw_clone_cb() - create a new process - * - * - non-fork() behavior: - * Function does return pid of the child or -1 on error. Pass in a callback - * function via the "fn" argument that gets executed in the child process. The - * "args" argument is passed to "fn". - * - * All other comments that apply to lxc_raw_clone() apply to lxc_raw_clone_cb() - * as well. 
- */ -extern pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, - unsigned long flags); - extern int lxc_namespace_2_cloneflag(const char *namespace); extern int lxc_namespace_2_ns_idx(const char *namespace); extern int lxc_namespace_2_std_identifiers(char *namespaces); diff --git a/src/lxc/raw_syscalls.c b/src/lxc/raw_syscalls.c index 5ce23eadf..045de9821 100644 --- a/src/lxc/raw_syscalls.c +++ b/src/lxc/raw_syscalls.c @@ -2,12 +2,16 @@ #define _GNU_SOURCE 1 #endif #include +#include +#include #include #include #include #include #include "config.h" +#include "macro.h" +#include "raw_syscalls.h" int lxc_raw_execveat(int dirfd, const char *pathname, char *const argv[], char *const envp[], int flags) @@ -19,3 +23,78 @@ int lxc_raw_execveat(int dirfd, const char *pathname, char *const argv[], return -1; #endif } + +/* + * This is based on raw_clone in systemd but adapted to our needs. This uses + * copy on write semantics and doesn't pass a stack. CLONE_VM is tricky and + * doesn't really matter to us so disallow it. + * + * The nice thing about this is that we get fork() behavior. That is + * lxc_raw_clone() returns 0 in the child and the child pid in the parent. + */ +pid_t lxc_raw_clone(unsigned long flags) +{ + + /* + * These flags don't interest us at all so we don't jump through any hoops + * of retrieving them and passing them to the kernel. + */ + errno = EINVAL; + if ((flags & (CLONE_VM | CLONE_PARENT_SETTID | CLONE_CHILD_SETTID | + CLONE_CHILD_CLEARTID | CLONE_SETTLS))) + return -EINVAL; + +#if defined(__s390x__) || defined(__s390__) || defined(__CRIS__) + /* On s390/s390x and cris the order of the first and second arguments + * of the system call is reversed. + */ + return (int)syscall(__NR_clone, NULL, flags | SIGCHLD); +#elif defined(__sparc__) && defined(__arch64__) + { + /* + * sparc64 always returns the other process id in %o0, and a + * boolean flag whether this is the child or the parent in %o1. 
+ * Inline assembly is needed to get the flag returned in %o1. + */ + int in_child; + int child_pid; + asm volatile("mov %2, %%g1\n\t" + "mov %3, %%o0\n\t" + "mov 0 , %%o1\n\t" + "t 0x6d\n\t" + "mov %%o1, %0\n\t" + "mov %%o0, %1" + : "=r"(in_child), "=r"(child_pid) + : "i"(__NR_clone), "r"(flags | SIGCHLD) + : "%o1", "%o0", "%g1"); + + if (in_child) + return 0; + else + return child_pid; + } +#elif defined(__ia64__) + /* On ia64 the stack and stack size are passed as separate arguments. */ + return (int)syscall(__NR_clone, flags | SIGCHLD, NULL, prctl_arg(0)); +#else + return (int)syscall(__NR_clone, flags | SIGCHLD, NULL); +#endif +} + +pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags) +{ + pid_t pid; + + pid = lxc_raw_clone(flags); + if (pid < 0) + return -1; + + /* + * exit() is not thread-safe and might mess with the parent's signal + * handlers and other stuff when exec() fails. + */ + if (pid == 0) + _exit(fn(args)); + + return pid; +} diff --git a/src/lxc/raw_syscalls.h b/src/lxc/raw_syscalls.h index af953c29e..9ce0b7145 100644 --- a/src/lxc/raw_syscalls.h +++ b/src/lxc/raw_syscalls.h @@ -23,8 +23,52 @@ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif +#include #include #include +#include + +/* + * lxc_raw_clone() - create a new process + * + * - fork() behavior: + * This function returns 0 in the child and > 0 in the parent. + * + * - copy-on-write: + * This function does not allocate a new stack and relies on copy-on-write + * semantics. + * + * - supports subset of CLONE_* flags: + * lxc_raw_clone() intentionally only supports a subset of the flags available + * to the actual system call. Please refer to the implementation what flags + * cannot be used. Also, please don't assume that just because a flag isn't + * explicitly checked for as being unsupported that it is supported. If in + * doubt or not sufficiently familiar with process creation in the kernel and + * interactions with libcs this function should be used. 
+ * + * - no pthread_atfork() handlers: + * This function circumvents - as much as this is possible - any libc + * wrappers and thus does not run any pthread_atfork() handlers. Make sure + * that this is safe to do in the context you are trying to call this + * function. + * + * - must call lxc_raw_getpid(): + * The child must use lxc_raw_getpid() to retrieve its pid. + */ +extern pid_t lxc_raw_clone(unsigned long flags); + +/* + * lxc_raw_clone_cb() - create a new process + * + * - non-fork() behavior: + * Function does return pid of the child or -1 on error. Pass in a callback + * function via the "fn" argument that gets executed in the child process. + * The "args" argument is passed to "fn". + * + * All other comments that apply to lxc_raw_clone() apply to lxc_raw_clone_cb() + * as well. + */ +extern pid_t lxc_raw_clone_cb(int (*fn)(void *), void *args, unsigned long flags); extern int lxc_raw_execveat(int dirfd, const char *pathname, char *const argv[], char *const envp[], int flags); diff --git a/src/lxc/start.c b/src/lxc/start.c index 04aabd4c3..4f805525b 100644 --- a/src/lxc/start.c +++ b/src/lxc/start.c @@ -70,6 +70,7 @@ #include "monitor.h" #include "namespace.h" #include "network.h" +#include "raw_syscalls.h" #include "start.h" #include "storage/storage.h" #include "storage/storage_utils.h" diff --git a/src/lxc/utils.c b/src/lxc/utils.c index 8f79ca9ab..27f4578ba 100644 --- a/src/lxc/utils.c +++ b/src/lxc/utils.c @@ -51,6 +51,7 @@ #include "lxclock.h" #include "namespace.h" #include "parse.h" +#include "raw_syscalls.h" #include "syscall_wrappers.h" #include "utils.h" diff --git a/src/tests/lxc_raw_clone.c b/src/tests/lxc_raw_clone.c index ae38880a4..30d060ed9 100644 --- a/src/tests/lxc_raw_clone.c +++ b/src/tests/lxc_raw_clone.c @@ -38,7 +38,7 @@ #include #include "lxctest.h" -#include "namespace.h" +#include "raw_syscalls.h" #include "utils.h" int main(int argc, char *argv[])