/* SPDX-License-Identifier: LGPL-2.1+ */ #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif #define __STDC_FORMAT_MACROS /* Required for PRIu64 to work. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Needs to be after sys/mount.h header */ #include #include #include #include #include #include #include #include "config.h" #include "log.h" #include "lsm/lsm.h" #include "lxclock.h" #include "memory_utils.h" #include "namespace.h" #include "parse.h" #include "process_utils.h" #include "syscall_wrappers.h" #include "utils.h" #if !HAVE_STRLCPY #include "strlcpy.h" #endif #if !HAVE_STRLCAT #include "strlcat.h" #endif #ifndef O_PATH #define O_PATH 010000000 #endif #ifndef O_NOFOLLOW #define O_NOFOLLOW 00400000 #endif lxc_log_define(utils, lxc); /* * if path is btrfs, tries to remove it and any subvolumes beneath it */ extern bool btrfs_try_remove_subvol(const char *path); static int _recursive_rmdir(const char *dirname, dev_t pdev, const char *exclude, int level, bool onedev) { __do_closedir DIR *dir = NULL; int failed = 0; bool hadexclude = false; int ret; struct dirent *direntp; char pathname[PATH_MAX]; dir = opendir(dirname); if (!dir) return log_error(-1, "Failed to open \"%s\"", dirname); while ((direntp = readdir(dir))) { int rc; struct stat mystat; if (strequal(direntp->d_name, ".") || strequal(direntp->d_name, "..")) continue; rc = strnprintf(pathname, sizeof(pathname), "%s/%s", dirname, direntp->d_name); if (rc < 0) { ERROR("The name of path is too long"); failed = 1; continue; } if (!level && exclude && strequal(direntp->d_name, exclude)) { ret = rmdir(pathname); if (ret < 0) { switch (errno) { case ENOTEMPTY: INFO("Not deleting snapshot \"%s\"", pathname); hadexclude = true; break; case ENOTDIR: ret = unlink(pathname); if (ret) INFO("Failed to remove \"%s\"", pathname); break; default: SYSERROR("Failed to rmdir \"%s\"", pathname); failed = 1; break; } } continue; } ret = 
lstat(pathname, &mystat); if (ret) { SYSERROR("Failed to stat \"%s\"", pathname); failed = 1; continue; } if (onedev && mystat.st_dev != pdev) { if (btrfs_try_remove_subvol(pathname)) INFO("Removed btrfs subvolume at \"%s\"", pathname); continue; } if (S_ISDIR(mystat.st_mode)) { if (_recursive_rmdir(pathname, pdev, exclude, level + 1, onedev) < 0) failed = 1; } else { ret = unlink(pathname); if (ret < 0) { __do_close int fd = -EBADF; fd = open(pathname, O_RDONLY | O_CLOEXEC | O_NONBLOCK); if (fd >= 0) { /* The file might be marked immutable. */ int attr = 0; ret = ioctl(fd, FS_IOC_GETFLAGS, &attr); if (ret < 0) SYSERROR("Failed to retrieve file flags"); attr &= ~FS_IMMUTABLE_FL; ret = ioctl(fd, FS_IOC_SETFLAGS, &attr); if (ret < 0) SYSERROR("Failed to set file flags"); } ret = unlink(pathname); if (ret < 0) { SYSERROR("Failed to delete \"%s\"", pathname); failed = 1; } } } } if (rmdir(dirname) < 0 && !btrfs_try_remove_subvol(dirname) && !hadexclude) { SYSERROR("Failed to delete \"%s\"", dirname); failed = 1; } return failed ? -1 : 0; } /* * In overlayfs, st_dev is unreliable. So on overlayfs we don't do the * lxc_rmdir_onedev(). 
*/ static inline bool is_native_overlayfs(const char *path) { return has_fs_type(path, OVERLAY_SUPER_MAGIC) || has_fs_type(path, OVERLAYFS_SUPER_MAGIC); } /* returns 0 on success, -1 if there were any failures */ extern int lxc_rmdir_onedev(const char *path, const char *exclude) { struct stat mystat; bool onedev = true; if (is_native_overlayfs(path)) onedev = false; if (lstat(path, &mystat) < 0) { if (errno == ENOENT) return 0; return log_error_errno(-1, errno, "Failed to stat \"%s\"", path); } return _recursive_rmdir(path, mystat.st_dev, exclude, 0, onedev); } /* borrowed from iproute2 */ extern int get_u16(unsigned short *val, const char *arg, int base) { unsigned long res; char *ptr; if (!arg || !*arg) return ret_errno(EINVAL); errno = 0; res = strtoul(arg, &ptr, base); if (!ptr || ptr == arg || *ptr || res > 0xFFFF || errno != 0) return ret_errno(ERANGE); *val = res; return 0; } int mkdir_p(const char *dir, mode_t mode) { const char *tmp = dir; const char *orig = dir; do { __do_free char *makeme = NULL; int ret; dir = tmp + strspn(tmp, "/"); tmp = dir + strcspn(dir, "/"); makeme = strndup(orig, dir - orig); if (!makeme) return ret_set_errno(-1, ENOMEM); ret = mkdir(makeme, mode); if (ret < 0 && errno != EEXIST) return log_error_errno(-1, errno, "Failed to create directory \"%s\"", makeme); } while (tmp != dir); return 0; } char *get_rundir(void) { __do_free char *rundir = NULL; char *static_rundir; int ret; size_t len; const char *homedir; struct stat sb; if (stat(RUNTIME_PATH, &sb) < 0) return NULL; if (geteuid() == sb.st_uid || getegid() == sb.st_gid) return strdup(RUNTIME_PATH); static_rundir = getenv("XDG_RUNTIME_DIR"); if (static_rundir) return strdup(static_rundir); INFO("XDG_RUNTIME_DIR isn't set in the environment"); homedir = getenv("HOME"); if (!homedir) return log_error(NULL, "HOME isn't set in the environment"); len = strlen(homedir) + 17; rundir = malloc(sizeof(char) * len); if (!rundir) return NULL; ret = strnprintf(rundir, len, 
"%s/.cache/lxc/run/", homedir); if (ret < 0) return ret_set_errno(NULL, EIO); return move_ptr(rundir); } int wait_for_pid(pid_t pid) { int status, ret; again: ret = waitpid(pid, &status, 0); if (ret == -1) { if (errno == EINTR) goto again; return -1; } if (ret != pid) goto again; if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) return -1; return 0; } int wait_for_pidfd(int pidfd) { int ret; siginfo_t info = { .si_signo = 0, }; do { ret = waitid(P_PIDFD, pidfd, &info, __WALL | WEXITED); } while (ret < 0 && errno == EINTR); return !ret && WIFEXITED(info.si_status) && WEXITSTATUS(info.si_status) == 0; } int lxc_wait_for_pid_status(pid_t pid) { int status, ret; again: ret = waitpid(pid, &status, 0); if (ret == -1) { if (errno == EINTR) goto again; return -1; } if (ret != pid) goto again; return status; } #ifdef HAVE_OPENSSL #include static int do_sha1_hash(const char *buf, int buflen, unsigned char *md_value, unsigned int *md_len) { EVP_MD_CTX *mdctx; const EVP_MD *md; md = EVP_get_digestbyname("sha1"); if (!md) return log_error(-1, "Unknown message digest: sha1\n"); mdctx = EVP_MD_CTX_create(); EVP_DigestInit_ex(mdctx, md, NULL); EVP_DigestUpdate(mdctx, buf, buflen); EVP_DigestFinal_ex(mdctx, md_value, md_len); EVP_MD_CTX_destroy(mdctx); return 0; } int sha1sum_file(char *fnam, unsigned char *digest, unsigned int *md_len) { __do_free char *buf = NULL; __do_fclose FILE *f = NULL; int ret; ssize_t flen; ssize_t nbytes; if (!fnam) return -1; f = fopen_cloexec(fnam, "r"); if (!f) return log_error_errno(-1, errno, "Failed to open template \"%s\"", fnam); if (fseek(f, 0, SEEK_END) < 0) return log_error_errno(-1, errno, "Failed to seek to end of template"); flen = ftell(f); if (flen < 0) return log_error_errno(-1, errno, "Failed to tell size of template"); if (fseek(f, 0, SEEK_SET) < 0) return log_error_errno(-1, errno, "Failed to seek to start of template"); buf = malloc(flen + 1); if (!buf) return log_error_errno(-1, ENOMEM, "Out of memory"); nbytes = fread(buf, 1, 
flen, f); if (nbytes < 0 || nbytes != flen) return log_error_errno(-1, errno, "Failed to read template"); buf[flen] = '\0'; ret = do_sha1_hash(buf, flen, (void *)digest, md_len); return ret; } #endif struct lxc_popen_FILE *lxc_popen(const char *command) { int ret; int pipe_fds[2]; pid_t child_pid; struct lxc_popen_FILE *fp = NULL; ret = pipe2(pipe_fds, O_CLOEXEC); if (ret < 0) return NULL; child_pid = fork(); if (child_pid < 0) goto on_error; if (!child_pid) { sigset_t mask; close(pipe_fds[0]); /* duplicate stdout */ if (pipe_fds[1] != STDOUT_FILENO) ret = dup2(pipe_fds[1], STDOUT_FILENO); else ret = fcntl(pipe_fds[1], F_SETFD, 0); if (ret < 0) { close(pipe_fds[1]); _exit(EXIT_FAILURE); } /* duplicate stderr */ if (pipe_fds[1] != STDERR_FILENO) ret = dup2(pipe_fds[1], STDERR_FILENO); else ret = fcntl(pipe_fds[1], F_SETFD, 0); close(pipe_fds[1]); if (ret < 0) _exit(EXIT_FAILURE); /* unblock all signals */ ret = sigfillset(&mask); if (ret < 0) _exit(EXIT_FAILURE); ret = pthread_sigmask(SIG_UNBLOCK, &mask, NULL); if (ret < 0) _exit(EXIT_FAILURE); /* check if /bin/sh exist, otherwise try Android location /system/bin/sh */ if (file_exists("/bin/sh")) execl("/bin/sh", "sh", "-c", command, (char *)NULL); else execl("/system/bin/sh", "sh", "-c", command, (char *)NULL); _exit(127); } close(pipe_fds[1]); pipe_fds[1] = -1; fp = malloc(sizeof(*fp)); if (!fp) goto on_error; memset(fp, 0, sizeof(*fp)); fp->child_pid = child_pid; fp->pipe = pipe_fds[0]; /* From now on, closing fp->f will also close fp->pipe. So only ever * call fclose(fp->f). */ fp->f = fdopen(pipe_fds[0], "r"); if (!fp->f) goto on_error; return fp; on_error: /* We can only close pipe_fds[0] if fdopen() didn't succeed or wasn't * called yet. Otherwise the fd belongs to the file opened by fdopen() * since it isn't dup()ed. 
*/ if (fp && !fp->f && pipe_fds[0] >= 0) close(pipe_fds[0]); if (pipe_fds[1] >= 0) close(pipe_fds[1]); if (fp && fp->f) fclose(fp->f); if (fp) free(fp); return NULL; } int lxc_pclose(struct lxc_popen_FILE *fp) { pid_t wait_pid; int wstatus = 0; if (!fp) return -1; do { wait_pid = waitpid(fp->child_pid, &wstatus, 0); } while (wait_pid < 0 && errno == EINTR); fclose(fp->f); free(fp); if (wait_pid < 0) return -1; return wstatus; } int randseed(bool srand_it) { __do_fclose FILE *f = NULL; /* * srand pre-seed function based on /dev/urandom */ unsigned int seed = time(NULL) + getpid(); f = fopen("/dev/urandom", "re"); if (f) { int ret = fread(&seed, sizeof(seed), 1, f); if (ret != 1) SYSDEBUG("Unable to fread /dev/urandom, fallback to time+pid rand seed"); } if (srand_it) srand(seed); return seed; } uid_t get_ns_uid(uid_t orig) { __do_free char *line = NULL; __do_fclose FILE *f = NULL; size_t sz = 0; uid_t nsid, hostid, range; f = fopen("/proc/self/uid_map", "re"); if (!f) return log_error_errno(0, errno, "Failed to open uid_map"); while (getline(&line, &sz, f) != -1) { if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3) continue; if (hostid <= orig && hostid + range > orig) return nsid += orig - hostid; } return LXC_INVALID_UID; } gid_t get_ns_gid(gid_t orig) { __do_free char *line = NULL; __do_fclose FILE *f = NULL; size_t sz = 0; gid_t nsid, hostid, range; f = fopen("/proc/self/gid_map", "re"); if (!f) return log_error_errno(0, errno, "Failed to open gid_map"); while (getline(&line, &sz, f) != -1) { if (sscanf(line, "%u %u %u", &nsid, &hostid, &range) != 3) continue; if (hostid <= orig && hostid + range > orig) return nsid += orig - hostid; } return LXC_INVALID_GID; } bool dir_exists(const char *path) { return exists_dir_at(-1, path); } /* Note we don't use SHA-1 here as we don't want to depend on HAVE_GNUTLS. 
* FNV has good anti collision properties and we're not worried * about pre-image resistance or one-way-ness, we're just trying to make * the name unique in the 108 bytes of space we have. */ uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval) { unsigned char *bp; for(bp = buf; bp < (unsigned char *)buf + len; bp++) { /* xor the bottom with the current octet */ hval ^= (uint64_t)*bp; /* gcc optimised: * multiply by the 64 bit FNV magic prime mod 2^64 */ hval += (hval << 1) + (hval << 4) + (hval << 5) + (hval << 7) + (hval << 8) + (hval << 40); } return hval; } bool is_shared_mountpoint(const char *path) { __do_fclose FILE *f = NULL; __do_free char *line = NULL; int i; size_t len = 0; f = fopen("/proc/self/mountinfo", "re"); if (!f) return 0; while (getline(&line, &len, f) > 0) { char *slider1, *slider2; for (slider1 = line, i = 0; slider1 && i < 4; i++) slider1 = strchr(slider1 + 1, ' '); if (!slider1) continue; slider2 = strchr(slider1 + 1, ' '); if (!slider2) continue; *slider2 = '\0'; if (strequal(slider1 + 1, path)) { /* This is the path. Is it shared? */ slider1 = strchr(slider2 + 1, ' '); if (slider1 && strstr(slider1, "shared:")) return true; } } return false; } /* * Detect whether / is mounted MS_SHARED. The only way I know of to * check that is through /proc/self/mountinfo. * I'm only checking for /. If the container rootfs or mount location * is MS_SHARED, but not '/', then you're out of luck - figuring that * out would be too much work to be worth it. 
*/
int detect_shared_rootfs(void)
{
	/* Returns 1 if is_shared_mountpoint("/") is true, 0 otherwise. */
	if (is_shared_mountpoint("/"))
		return 1;
	return 0;
}

/*
 * Move the calling process into the namespace @ns (e.g. "net", "mnt") of the
 * process @pid by opening /proc/@pid/ns/@ns and calling setns() on it.
 *
 * Returns true on success, false if the path cannot be built or opened or
 * setns() fails.
 */
bool switch_to_ns(pid_t pid, const char *ns)
{
	/* Closed automatically when the function returns. */
	__do_close int fd = -EBADF;
	int ret;
	char nspath[STRLITERALLEN("/proc//ns/") +
		    INTTYPE_TO_STRLEN(pid_t) +
		    LXC_NAMESPACE_NAME_MAX];

	/* Switch to new ns */
	ret = strnprintf(nspath, sizeof(nspath), "/proc/%d/ns/%s", pid, ns);
	if (ret < 0)
		return false;

	fd = open(nspath, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		return log_error_errno(false, errno, "Failed to open \"%s\"", nspath);

	/* nstype 0: accept whatever namespace type the fd refers to. */
	ret = setns(fd, 0);
	if (ret)
		return log_error_errno(false, errno, "Failed to set process %d to \"%s\" of %d", pid, ns, fd);

	return true;
}

/*
 * looking at fs/proc_namespace.c, it appears we can
 * actually expect the rootfs entry to very specifically contain
 * " - rootfs rootfs "
 * IIUC, so long as we've chrooted so that rootfs is not our root,
 * the rootfs entry should always be skipped in mountinfo contents.
 *
 * Returns true if /proc/self/mountinfo has an entry mounted on "/" whose
 * text from the first '-' after the mount options starts with "- rootfs ",
 * false otherwise (including when mountinfo cannot be opened).
 */
bool detect_ramfs_rootfs(void)
{
	__do_free char *line = NULL;
	__do_free void *fopen_cache = NULL;
	__do_fclose FILE *f = NULL;
	size_t len = 0;

	f = fopen_cached("/proc/self/mountinfo", "re", &fopen_cache);
	if (!f)
		return false;

	while (getline(&line, &len, f) != -1) {
		int i;
		char *p, *p2;

		/* Advance @p to the fourth space: the mount point follows it. */
		for (p = line, i = 0; p && i < 4; i++)
			p = strchr(p + 1, ' ');
		if (!p)
			continue;

		/* @p2: the space terminating the mount point field. */
		p2 = strchr(p + 1, ' ');
		if (!p2)
			continue;
		*p2 = '\0';
		if (strequal(p + 1, "/")) {
			/* This is '/'. Is it the ramfs? */
			p = strchr(p2 + 1, '-');
			if (p && strnequal(p, "- rootfs ", 9))
				return true;
		}
	}

	return false;
}

/*
 * Search the directories listed in $PATH for an executable named @cmd. If
 * @rootfs is given every directory is looked up beneath it.
 *
 * Returns a heap allocated copy of the first match the caller has to free,
 * or NULL if PATH is unset, nothing matches or an allocation fails.
 */
char *on_path(const char *cmd, const char *rootfs)
{
	__do_free char *path = NULL;
	char *entry = NULL;
	char cmdpath[PATH_MAX];
	int ret;

	path = getenv("PATH");
	if (!path)
		return NULL;

	/* Work on a private copy: the iteration below splits it in place. */
	path = strdup(path);
	if (!path)
		return NULL;

	lxc_iterate_parts(entry, path, ":") {
		if (rootfs)
			ret = strnprintf(cmdpath, sizeof(cmdpath), "%s/%s/%s", rootfs, entry, cmd);
		else
			ret = strnprintf(cmdpath, sizeof(cmdpath), "%s/%s", entry, cmd);
		if (ret < 0)
			continue;

		if (access(cmdpath, X_OK) == 0)
			return strdup(cmdpath);
	}

	return NULL;
}

/* historically lxc-init has been under /usr/lib/lxc and under
 * /usr/lib/$ARCH/lxc. It now lives as $prefix/sbin/init.lxc.
 *
 * Locate the lxc init binary, beneath @rootfs if it is not NULL. Candidates
 * are tried in this order and the first executable one wins:
 *   1. "init.lxc" on $PATH (a default PATH is set temporarily if unset),
 *   2. SBINDIR/init.lxc,
 *   3. LXCINITDIR/lxc/lxc-init,
 *   4. /usr/lib/lxc/lxc-init,
 *   5. /sbin/lxc-init,
 *   6. /init.lxc.static (only when @rootfs is NULL).
 *
 * Returns a heap allocated path the caller has to free, or NULL.
 */
char *choose_init(const char *rootfs)
{
	char *retv = NULL;
	const char *empty = "",
		   *tmp;
	int ret, env_set = 0;

	if (!getenv("PATH")) {
		if (setenv("PATH", "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", 0))
			SYSERROR("Failed to setenv");

		/* Remember to undo the temporary PATH after the lookup. */
		env_set = 1;
	}

	retv = on_path("init.lxc", rootfs);

	if (env_set)
		if (unsetenv("PATH"))
			SYSERROR("Failed to unsetenv");

	if (retv)
		return retv;

	/* From here on @retv is a scratch buffer reused for every candidate. */
	retv = malloc(PATH_MAX);
	if (!retv)
		return NULL;

	if (rootfs)
		tmp = rootfs;
	else
		tmp = empty;

	ret = strnprintf(retv, PATH_MAX, "%s/%s/%s", tmp, SBINDIR, "/init.lxc");
	if (ret < 0) {
		ERROR("The name of path is too long");
		goto out1;
	}

	if (access(retv, X_OK) == 0)
		return retv;

	ret = strnprintf(retv, PATH_MAX, "%s/%s/%s", tmp, LXCINITDIR, "/lxc/lxc-init");
	if (ret < 0) {
		ERROR("The name of path is too long");
		goto out1;
	}

	if (access(retv, X_OK) == 0)
		return retv;

	ret = strnprintf(retv, PATH_MAX, "%s/usr/lib/lxc/lxc-init", tmp);
	if (ret < 0) {
		ERROR("The name of path is too long");
		goto out1;
	}

	if (access(retv, X_OK) == 0)
		return retv;

	ret = strnprintf(retv, PATH_MAX, "%s/sbin/lxc-init", tmp);
	if (ret < 0) {
		ERROR("The name of path is too long");
		goto out1;
	}

	if (access(retv, X_OK) == 0)
		return retv;

	/*
	 * Last resort, look for the statically compiled init.lxc which we
	 * hopefully bind-mounted in.
	 * If we are called during container setup, and we get to this point,
	 * then the init.lxc.static from the host will need to be bind-mounted
	 * in. So we return NULL here to indicate that.
	 */
	if (rootfs)
		goto out1;

	ret = strnprintf(retv, PATH_MAX, "/init.lxc.static");
	if (ret < 0) {
		WARN("Nonsense - name /lxc.init.static too long");
		goto out1;
	}

	if (access(retv, X_OK) == 0)
		return retv;

out1:
	free(retv);
	return NULL;
}

/*
 * Given the '-t' template option to lxc-create, figure out what to
 * do. If the template is a full executable path, use that. If it
 * is something like 'sshd', then return $templatepath/lxc-sshd.
 * On success return the template, on error return NULL.
 *
 * The returned string is heap allocated and has to be freed by the caller.
 */
char *get_template_path(const char *t)
{
	int ret, len;
	char *tpath;

	/* Absolute path: use it as is, provided it is executable. */
	if (t[0] == '/') {
		if (access(t, X_OK) == 0) {
			return strdup(t);
		} else {
			SYSERROR("Bad template pathname: %s", t);
			return NULL;
		}
	}

	len = strlen(LXCTEMPLATEDIR) + strlen(t) + strlen("/lxc-") + 1;

	tpath = malloc(len);
	if (!tpath)
		return NULL;

	ret = strnprintf(tpath, len, "%s/lxc-%s", LXCTEMPLATEDIR, t);
	if (ret < 0) {
		free(tpath);
		return NULL;
	}

	if (access(tpath, X_OK) < 0) {
		SYSERROR("bad template: %s", t);
		free(tpath);
		return NULL;
	}

	return tpath;
}

/*
 * @path: a pathname where / replaced with '\0'.
 * @offsetp: pointer to int showing which path segment was last seen.
 * Updated on return to reflect the next segment.
 * @fulllen: full original path length.
 * Returns a pointer to the next path segment, or NULL if done.
 */
static char *get_nextpath(char *path, int *offsetp, int fulllen)
{
	int offset = *offsetp;

	if (offset >= fulllen)
		return NULL;

	/* Skip the remainder of the current segment ... */
	while (offset < fulllen && path[offset] != '\0')
		offset++;

	/* ... and the separator(s) following it. */
	while (offset < fulllen && path[offset] == '\0')
		offset++;

	*offsetp = offset;

	return (offset < fulllen) ? &path[offset] : NULL;
}

/*
 * Check that @subdir is a subdir of @dir. @len is the length of
 * @dir (to avoid having to recalculate it).
*/ static bool is_subdir(const char *subdir, const char *dir, size_t len) { size_t subdirlen = strlen(subdir); if (subdirlen < len) return false; if (!strnequal(subdir, dir, len)) return false; if (dir[len-1] == '/') return true; if (subdir[len] == '/' || subdirlen == len) return true; return false; } /* * Check if the open fd is a symlink. Return -ELOOP if it is. Return * -ENOENT if we couldn't fstat. Return 0 if the fd is ok. */ static int check_symlink(int fd) { struct stat sb; int ret; ret = fstat(fd, &sb); if (ret < 0) return -ENOENT; if (S_ISLNK(sb.st_mode)) return -ELOOP; return 0; } /* * Open a file or directory, provided that it contains no symlinks. * * CAVEAT: This function must not be used for other purposes than container * setup before executing the container's init */ static int open_if_safe(int dirfd, const char *nextpath) { int newfd = openat(dirfd, nextpath, O_RDONLY | O_NOFOLLOW); if (newfd >= 0) /* Was not a symlink, all good. */ return newfd; if (errno == ELOOP) return newfd; if (errno == EPERM || errno == EACCES) { /* We're not root (cause we got EPERM) so try opening with * O_PATH. */ newfd = openat(dirfd, nextpath, O_PATH | O_NOFOLLOW); if (newfd >= 0) { /* O_PATH will return an fd for symlinks. We know * nextpath wasn't a symlink at last openat, so if fd is * now a link, then something * fishy is going on. */ int ret = check_symlink(newfd); if (ret < 0) { close(newfd); newfd = ret; } } } return newfd; } /* * Open a path intending for mounting, ensuring that the final path * is inside the container's rootfs. * * CAVEAT: This function must not be used for other purposes than container * setup before executing the container's init * * @target: path to be opened * @prefix_skip: a part of @target in which to ignore symbolic links. This * would be the container's rootfs. * * Return an open fd for the path, or <0 on error. 
*/
static int open_without_symlink(const char *target, const char *prefix_skip)
{
	int curlen = 0, dirfd, fulllen, i;
	char *dup;

	fulllen = strlen(target);

	/* make sure prefix-skip makes sense */
	if (prefix_skip && strlen(prefix_skip) > 0) {
		curlen = strlen(prefix_skip);
		if (!is_subdir(target, prefix_skip, curlen)) {
			ERROR("WHOA there - target \"%s\" didn't start with prefix \"%s\"",
			      target, prefix_skip);
			return -EINVAL;
		}

		/*
		 * get_nextpath() expects the curlen argument to be
		 * on a (turned into \0) / or before it, so decrement
		 * curlen to make sure that happens
		 */
		if (curlen)
			curlen--;
	} else {
		/* No prefix: walk every component starting at the root. */
		prefix_skip = "/";
		curlen = 0;
	}

	/* Make a copy of target which we can hack up, and tokenize it */
	if ((dup = strdup(target)) == NULL) {
		ERROR("Out of memory checking for symbolic link");
		return -ENOMEM;
	}

	for (i = 0; i < fulllen; i++) {
		if (dup[i] == '/')
			dup[i] = '\0';
	}

	dirfd = open(prefix_skip, O_RDONLY);
	if (dirfd < 0) {
		SYSERROR("Failed to open path \"%s\"", prefix_skip);
		goto out;
	}

	/*
	 * Descend one component at a time, always relative to the fd of the
	 * previous component. The loop ends when the components are used up
	 * (dirfd is the result) or a component fails to open (dirfd < 0).
	 */
	for (;;) {
		int newfd, saved_errno;
		char *nextpath;

		if ((nextpath = get_nextpath(dup, &curlen, fulllen)) == NULL)
			goto out;

		newfd = open_if_safe(dirfd, nextpath);
		/* close() below may clobber errno; keep the open error. */
		saved_errno = errno;
		close(dirfd);

		dirfd = newfd;
		if (newfd < 0) {
			errno = saved_errno;
			if (errno == ELOOP)
				SYSERROR("%s in %s was a symbolic link!", nextpath, target);

			goto out;
		}
	}

out:
	free(dup);
	return dirfd;
}

/*
 * Mount @src onto @dst where @dst - and @src for relative bind mounts - is
 * resolved with openat2() relative to @beneath_fd using the
 * PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS resolve flags. The mount itself is
 * performed on the /proc/self/fd/<fd> paths of the opened fds.
 *
 * Returns the result of mount() or a negative errno-style value if
 * @beneath_fd is invalid or a path cannot be opened or formatted.
 */
int __safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
			    unsigned int flags, const void *data)
{
	__do_close int source_fd = -EBADF, target_fd = -EBADF;
	struct lxc_open_how how = {
		.flags		= PROTECT_OPATH_DIRECTORY,
		.resolve	= PROTECT_LOOKUP_BENEATH_WITH_MAGICLINKS,
	};
	int ret;
	char src_buf[LXC_PROC_PID_FD_LEN], tgt_buf[LXC_PROC_PID_FD_LEN];

	if (beneath_fd < 0)
		return -EINVAL;

	/* Only a relative bind mount source is resolved beneath @beneath_fd. */
	if ((flags & MS_BIND) && src && src[0] != '/') {
		source_fd = openat2(beneath_fd, src, &how, sizeof(how));
		if (source_fd < 0)
			return -errno;
		ret = strnprintf(src_buf, sizeof(src_buf), "/proc/self/fd/%d", source_fd);
		if (ret < 0)
			return -EIO;
	} else {
		/* Empty buffer: use @src verbatim for the mount below. */
		src_buf[0] = '\0';
	}

	target_fd = openat2(beneath_fd, dst, &how, sizeof(how));
	if (target_fd < 0)
		return log_error_errno(-errno, errno, "Failed to open %d(%s)", beneath_fd, dst);
	ret = strnprintf(tgt_buf, sizeof(tgt_buf), "/proc/self/fd/%d", target_fd);
	if (ret < 0)
		return -EIO;

	if (!is_empty_string(src_buf))
		ret = mount(src_buf, tgt_buf, fstype, flags, data);
	else
		ret = mount(src, tgt_buf, fstype, flags, data);

	return ret;
}

/*
 * Path based variant of __safe_mount_beneath_at(): opens @beneath (or "/" if
 * it is NULL) and mounts relative to it. Returns a negative errno-style value
 * if @beneath cannot be opened.
 */
int safe_mount_beneath(const char *beneath, const char *src, const char *dst, const char *fstype,
		       unsigned int flags, const void *data)
{
	__do_close int beneath_fd = -EBADF;
	const char *path = beneath ? beneath : "/";

	beneath_fd = openat(-1, path, PROTECT_OPATH_DIRECTORY);
	if (beneath_fd < 0)
		return log_error_errno(-errno, errno, "Failed to open %s", path);

	return __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);
}

/* Public name for __safe_mount_beneath_at(); all arguments are passed on. */
int safe_mount_beneath_at(int beneath_fd, const char *src, const char *dst, const char *fstype,
			  unsigned int flags, const void *data)
{
	return __safe_mount_beneath_at(beneath_fd, src, dst, fstype, flags, data);
}

/*
 * Safely mount a path into a container, ensuring that the mount target
 * is under the container's @rootfs.  (If @rootfs is NULL, then the container
 * uses the host's /)
 *
 * CAVEAT: This function must not be used for other purposes than container
 * setup before executing the container's init
 *
 * Returns 0 on success, otherwise a negative value: the error from
 * open_without_symlink(), -EINVAL if an fd path cannot be formatted, or the
 * result of mount() with errno preserved.
 */
int safe_mount(const char *src, const char *dest, const char *fstype,
	       unsigned long flags, const void *data, const char *rootfs)
{
	int destfd, ret, saved_errno;
	/* Only needs enough for /proc/self/fd/<fd>. */
	char srcbuf[50], destbuf[50];
	int srcfd = -1;
	const char *mntsrc = src;

	if (!rootfs)
		rootfs = "";

	/* todo - allow symlinks for relative paths if 'allowsymlinks' option is passed */
	if (flags & MS_BIND && src && src[0] != '/') {
		INFO("This is a relative bind mount");

		srcfd = open_without_symlink(src, NULL);
		if (srcfd < 0)
			return srcfd;

		/* NOTE(review): the message below is logged for a formatting failure. */
		ret = strnprintf(srcbuf, sizeof(srcbuf), "/proc/self/fd/%d", srcfd);
		if (ret < 0) {
			close(srcfd);
			ERROR("Out of memory");
			return -EINVAL;
		}
		mntsrc = srcbuf;
	}

	destfd = open_without_symlink(dest, rootfs);
	if (destfd < 0) {
		if (srcfd != -1) {
			/* Keep the errno of the failed open across close(). */
			saved_errno = errno;
			close(srcfd);
			errno = saved_errno;
		}

		return destfd;
	}

	ret = strnprintf(destbuf, sizeof(destbuf), "/proc/self/fd/%d", destfd);
	if (ret < 0) {
		if (srcfd != -1)
			close(srcfd);

		close(destfd);
		ERROR("Out of memory");
		return -EINVAL;
	}

	/* Mount through the fds so the checked paths cannot be swapped out. */
	ret = mount(mntsrc, destbuf, fstype, flags, data);
	saved_errno = errno;
	if (srcfd != -1)
		close(srcfd);

	close(destfd);
	if (ret < 0) {
		errno = saved_errno;
		SYSERROR("Failed to mount \"%s\" onto \"%s\"", src ? src : "(null)", dest);
		return ret;
	}

	return 0;
}

/* Open /dev/null read-write. Returns the fd, or a negative value on error. */
int open_devnull(void)
{
	int fd = open("/dev/null", O_RDWR);
	if (fd < 0)
		SYSERROR("Can't open /dev/null");

	return fd;
}

/*
 * Duplicate @fd onto stdin, stdout and stderr. @fd itself stays open.
 * Returns 0 on success, -1 if @fd is negative or a dup2() fails.
 */
int set_stdfds(int fd)
{
	int ret;

	if (fd < 0)
		return -1;

	ret = dup2(fd, STDIN_FILENO);
	if (ret < 0)
		return -1;

	ret = dup2(fd, STDOUT_FILENO);
	if (ret < 0)
		return -1;

	ret = dup2(fd, STDERR_FILENO);
	if (ret < 0)
		return -1;

	return 0;
}

/*
 * Redirect stdin, stdout and stderr to /dev/null.
 * Returns 0 on success, -1 on failure.
 */
int null_stdfds(void)
{
	int ret = -1;
	int fd;

	fd = open_devnull();
	if (fd >= 0) {
		ret = set_stdfds(fd);
		close(fd);
	}

	return ret;
}

/* Check whether a signal is blocked by a process.
*/ /* /proc/pid-to-str/status\0 = (5 + 21 + 7 + 1) */ #define __PROC_STATUS_LEN (6 + INTTYPE_TO_STRLEN(pid_t) + 7 + 1) bool task_blocks_signal(pid_t pid, int signal) { __do_free char *line = NULL; __do_fclose FILE *f = NULL; int ret; char status[__PROC_STATUS_LEN] = {0}; uint64_t sigblk = 0, one = 1; size_t n = 0; bool bret = false; ret = strnprintf(status, sizeof(status), "/proc/%d/status", pid); if (ret < 0) return bret; f = fopen(status, "re"); if (!f) return false; while (getline(&line, &n, f) != -1) { char *numstr; if (!strnequal(line, "SigBlk:", 7)) continue; numstr = lxc_trim_whitespace_in_place(line + 7); ret = lxc_safe_uint64(numstr, &sigblk, 16); if (ret < 0) return false; break; } if (sigblk & (one << (signal - 1))) bret = true; return bret; } int lxc_preserve_ns(const int pid, const char *ns) { int ret; /* 5 /proc + 21 /int_as_str + 3 /ns + 20 /NS_NAME + 1 \0 */ #define __NS_PATH_LEN 50 char path[__NS_PATH_LEN]; /* This way we can use this function to also check whether namespaces * are supported by the kernel by passing in the NULL or the empty * string. */ ret = strnprintf(path, sizeof(path), "/proc/%d/ns%s%s", pid, !ns || strequal(ns, "") ? "" : "/", !ns || strequal(ns, "") ? "" : ns); if (ret < 0) return ret_errno(EIO); return open(path, O_RDONLY | O_CLOEXEC); } bool lxc_switch_uid_gid(uid_t uid, gid_t gid) { int ret = 0; if (gid != LXC_INVALID_GID) { ret = setresgid(gid, gid, gid); if (ret < 0) { SYSERROR("Failed to switch to gid %d", gid); return false; } NOTICE("Switched to gid %d", gid); } if (uid != LXC_INVALID_UID) { ret = setresuid(uid, uid, uid); if (ret < 0) { SYSERROR("Failed to switch to uid %d", uid); return false; } NOTICE("Switched to uid %d", uid); } return true; } /* Simple convenience function which enables uniform logging. 
*/ bool lxc_drop_groups(void) { int ret; ret = setgroups(0, NULL); if (ret) return log_error_errno(false, errno, "Failed to drop supplimentary groups"); NOTICE("Dropped supplimentary groups"); return ret == 0; } bool lxc_setgroups(gid_t list[], size_t size) { int ret; ret = setgroups(size, list); if (ret) return log_error_errno(false, errno, "Failed to set supplimentary groups"); if (size > 0 && lxc_log_trace()) { for (size_t i = 0; i < size; i++) TRACE("Setting supplimentary group %d", list[i]); } NOTICE("Set supplimentary groups"); return true; } static int lxc_get_unused_loop_dev_legacy(char *loop_name) { struct dirent *dp; struct loop_info64 lo64; DIR *dir; int dfd = -1, fd = -1, ret = -1; dir = opendir("/dev"); if (!dir) { SYSERROR("Failed to open \"/dev\""); return -1; } while ((dp = readdir(dir))) { if (!strnequal(dp->d_name, "loop", 4)) continue; dfd = dirfd(dir); if (dfd < 0) continue; fd = openat(dfd, dp->d_name, O_RDWR); if (fd < 0) continue; ret = ioctl(fd, LOOP_GET_STATUS64, &lo64); if (ret < 0) { if (ioctl(fd, LOOP_GET_STATUS64, &lo64) == 0 || errno != ENXIO) { close(fd); fd = -1; continue; } } ret = strnprintf(loop_name, LO_NAME_SIZE, "/dev/%s", dp->d_name); if (ret < 0) { close(fd); fd = -1; continue; } break; } closedir(dir); if (fd < 0) return -1; return fd; } static int lxc_get_unused_loop_dev(char *name_loop) { int loop_nr, ret; int fd_ctl = -1, fd_tmp = -1; fd_ctl = open("/dev/loop-control", O_RDWR | O_CLOEXEC); if (fd_ctl < 0) { SYSERROR("Failed to open loop control"); return -ENODEV; } loop_nr = ioctl(fd_ctl, LOOP_CTL_GET_FREE); if (loop_nr < 0) { SYSERROR("Failed to get loop control"); goto on_error; } ret = strnprintf(name_loop, LO_NAME_SIZE, "/dev/loop%d", loop_nr); if (ret < 0) goto on_error; fd_tmp = open(name_loop, O_RDWR | O_CLOEXEC); if (fd_tmp < 0) { /* on Android loop devices are moved under /dev/block, give it a shot */ ret = strnprintf(name_loop, LO_NAME_SIZE, "/dev/block/loop%d", loop_nr); if (ret < 0) goto on_error; fd_tmp = 
open(name_loop, O_RDWR | O_CLOEXEC); if (fd_tmp < 0) SYSERROR("Failed to open loop \"%s\"", name_loop); } on_error: close(fd_ctl); return fd_tmp; } int lxc_prepare_loop_dev(const char *source, char *loop_dev, int flags) { int ret; struct loop_info64 lo64; int fd_img = -1, fret = -1, fd_loop = -1; fd_loop = lxc_get_unused_loop_dev(loop_dev); if (fd_loop < 0) { if (fd_loop != -ENODEV) goto on_error; fd_loop = lxc_get_unused_loop_dev_legacy(loop_dev); if (fd_loop < 0) goto on_error; } fd_img = open(source, O_RDWR | O_CLOEXEC); if (fd_img < 0) { SYSERROR("Failed to open source \"%s\"", source); goto on_error; } ret = ioctl(fd_loop, LOOP_SET_FD, fd_img); if (ret < 0) { SYSERROR("Failed to set loop fd"); goto on_error; } memset(&lo64, 0, sizeof(lo64)); lo64.lo_flags = flags; strlcpy((char *)lo64.lo_file_name, source, LO_NAME_SIZE); ret = ioctl(fd_loop, LOOP_SET_STATUS64, &lo64); if (ret < 0) { SYSERROR("Failed to set loop status64"); goto on_error; } fret = 0; on_error: if (fd_img >= 0) close(fd_img); if (fret < 0 && fd_loop >= 0) { close(fd_loop); fd_loop = -1; } return fd_loop; } int lxc_unstack_mountpoint(const char *path, bool lazy) { int ret; int umounts = 0; pop_stack: ret = umount2(path, lazy ? MNT_DETACH : 0); if (ret < 0) { /* We consider anything else than EINVAL deadly to prevent going * into an infinite loop. (The other alternative is constantly * parsing /proc/self/mountinfo which is yucky and probably * racy.) */ if (errno != EINVAL) return -errno; } else { /* Just stop counting when this happens. That'd just be so * stupid that we won't even bother trying to report back the * correct value anymore. */ if (umounts != INT_MAX) umounts++; /* We succeeded in umounting. Make sure that there's no other * mountpoint stacked underneath. 
*/ goto pop_stack; } return umounts; } static int run_command_internal(char *buf, size_t buf_size, int (*child_fn)(void *), void *args, bool wait_status) { pid_t child; int ret, fret, pipefd[2]; ssize_t bytes; /* Make sure our callers do not receive uninitialized memory. */ if (buf_size > 0 && buf) buf[0] = '\0'; if (pipe(pipefd) < 0) { SYSERROR("Failed to create pipe"); return -1; } child = lxc_raw_clone(0, NULL); if (child < 0) { close(pipefd[0]); close(pipefd[1]); SYSERROR("Failed to create new process"); return -1; } if (child == 0) { /* Close the read-end of the pipe. */ close(pipefd[0]); /* Redirect std{err,out} to write-end of the * pipe. */ ret = dup2(pipefd[1], STDOUT_FILENO); if (ret >= 0) ret = dup2(pipefd[1], STDERR_FILENO); /* Close the write-end of the pipe. */ close(pipefd[1]); if (ret < 0) { SYSERROR("Failed to duplicate std{err,out} file descriptor"); _exit(EXIT_FAILURE); } /* Does not return. */ child_fn(args); ERROR("Failed to exec command"); _exit(EXIT_FAILURE); } /* close the write-end of the pipe */ close(pipefd[1]); if (buf && buf_size > 0) { bytes = lxc_read_nointr(pipefd[0], buf, buf_size - 1); if (bytes > 0) buf[bytes - 1] = '\0'; } if (wait_status) fret = lxc_wait_for_pid_status(child); else fret = wait_for_pid(child); /* close the read-end of the pipe */ close(pipefd[0]); return fret; } int run_command(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) { return run_command_internal(buf, buf_size, child_fn, args, false); } int run_command_status(char *buf, size_t buf_size, int (*child_fn)(void *), void *args) { return run_command_internal(buf, buf_size, child_fn, args, true); } bool lxc_nic_exists(char *nic) { #define __LXC_SYS_CLASS_NET_LEN 15 + IFNAMSIZ + 1 char path[__LXC_SYS_CLASS_NET_LEN]; int ret; struct stat sb; if (strequal(nic, "none")) return true; ret = strnprintf(path, sizeof(path), "/sys/class/net/%s", nic); if (ret < 0) return false; ret = stat(path, &sb); if (ret < 0) return false; return true; } uint64_t 
lxc_find_next_power2(uint64_t n) { /* 0 is not valid input. We return 0 to the caller since 0 is not a * valid power of two. */ if (n == 0) return 0; if (!(n & (n - 1))) return n; while (n & (n - 1)) n = n & (n - 1); n = n << 1; return n; } static int process_dead(/* takes */ int status_fd) { __do_close int dupfd = -EBADF; __do_free char *line = NULL; __do_fclose FILE *f = NULL; int ret = 0; size_t n = 0; dupfd = dup(status_fd); if (dupfd < 0) return -1; if (fd_cloexec(dupfd, true) < 0) return -1; f = fdopen(dupfd, "re"); if (!f) return -1; /* Transfer ownership of fd. */ move_fd(dupfd); ret = 0; while (getline(&line, &n, f) != -1) { char *state; if (!strnequal(line, "State:", 6)) continue; state = lxc_trim_whitespace_in_place(line + 6); /* only check whether process is dead or zombie for now */ if (*state == 'X' || *state == 'Z') ret = 1; } return ret; } int lxc_set_death_signal(int signal, pid_t parent, int parent_status_fd) { int ret; pid_t ppid; ret = prctl(PR_SET_PDEATHSIG, prctl_arg(signal), prctl_arg(0), prctl_arg(0), prctl_arg(0)); /* verify that we haven't been orphaned in the meantime */ ppid = (pid_t)syscall(SYS_getppid); if (ppid == 0) { /* parent outside our pidns */ if (parent_status_fd < 0) return 0; if (process_dead(parent_status_fd) == 1) return raise(SIGKILL); } else if (ppid != parent) { return raise(SIGKILL); } if (ret < 0) return -1; return 0; } int lxc_rm_rf(const char *dirname) { __do_closedir DIR *dir = NULL; int fret = 0; int ret; struct dirent *direntp; dir = opendir(dirname); if (!dir) return log_error_errno(-1, errno, "Failed to open dir \"%s\"", dirname); while ((direntp = readdir(dir))) { __do_free char *pathname = NULL; struct stat mystat; if (strequal(direntp->d_name, ".") || strequal(direntp->d_name, "..")) continue; pathname = must_make_path(dirname, direntp->d_name, NULL); ret = lstat(pathname, &mystat); if (ret < 0) { if (!fret) SYSWARN("Failed to stat \"%s\"", pathname); fret = -1; continue; } if (!S_ISDIR(mystat.st_mode)) 
continue; ret = lxc_rm_rf(pathname); if (ret < 0) fret = -1; } ret = rmdir(dirname); if (ret < 0) return log_warn_errno(-1, errno, "Failed to delete \"%s\"", dirname); return fret; } bool lxc_can_use_pidfd(int pidfd) { int ret; if (pidfd < 0) return log_error(false, "Kernel does not support pidfds"); /* * We don't care whether or not children were in a waitable state. We * just care whether waitid() recognizes P_PIDFD. * * Btw, while I have your attention, the above waitid() code is an * excellent example of how _not_ to do flag-based kernel APIs. So if * you ever go into kernel development or are already and you add this * kind of flag potpourri even though you have read this comment shame * on you. May the gods of operating system development have mercy on * your soul because I won't. */ ret = waitid(P_PIDFD, pidfd, NULL, /* Type of children to wait for. */ __WALL | /* How to wait for them. */ WNOHANG | WNOWAIT | /* What state to wait for. */ WEXITED | WSTOPPED | WCONTINUED); if (ret < 0) return log_error_errno(false, errno, "Kernel does not support waiting on processes through pidfds"); ret = lxc_raw_pidfd_send_signal(pidfd, 0, NULL, 0); if (ret) return log_error_errno(false, errno, "Kernel does not support sending singals through pidfds"); return log_trace(true, "Kernel supports pidfds"); } int fix_stdio_permissions(uid_t uid) { __do_close int devnull_fd = -EBADF; int fret = 0; int std_fds[] = {STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO}; int ret; struct stat st, st_null; devnull_fd = open_devnull(); if (devnull_fd < 0) return log_trace_errno(-1, errno, "Failed to open \"/dev/null\""); ret = fstat(devnull_fd, &st_null); if (ret) return log_trace_errno(-errno, errno, "Failed to stat \"/dev/null\""); for (size_t i = 0; i < ARRAY_SIZE(std_fds); i++) { ret = fstat(std_fds[i], &st); if (ret) { SYSWARN("Failed to stat standard I/O file descriptor %d", std_fds[i]); fret = -1; continue; } if (st.st_rdev == st_null.st_rdev) continue; ret = fchown(std_fds[i], uid, 
st.st_gid); if (ret) { SYSTRACE("Failed to chown standard I/O file descriptor %d to uid %d and gid %d", std_fds[i], uid, st.st_gid); fret = -1; continue; } ret = fchmod(std_fds[i], 0700); if (ret) { SYSTRACE("Failed to chmod standard I/O file descriptor %d", std_fds[i]); fret = -1; } } return fret; } bool multiply_overflow(int64_t base, uint64_t mult, int64_t *res) { if (base > 0 && base > (int64_t)(INT64_MAX / mult)) return false; if (base < 0 && base < (int64_t)(INT64_MIN / mult)) return false; *res = (int64_t)(base * mult); return true; } int print_r(int fd, const char *path) { __do_close int dfd = -EBADF, dfd_dup = -EBADF; __do_closedir DIR *dir = NULL; int ret = 0; struct dirent *direntp; struct stat st; if (is_empty_string(path)) { char buf[LXC_PROC_SELF_FD_LEN]; ret = strnprintf(buf, sizeof(buf), "/proc/self/fd/%d", fd); if (ret < 0) return ret_errno(EIO); /* * O_PATH file descriptors can't be used so we need to re-open * just in case. */ dfd = openat(-EBADF, buf, O_CLOEXEC | O_DIRECTORY, 0); } else { dfd = openat(fd, path, O_CLOEXEC | O_DIRECTORY, 0); } if (dfd < 0) return -1; dfd_dup = dup_cloexec(dfd); if (dfd_dup < 0) return -1; dir = fdopendir(dfd); if (!dir) return -1; /* Transfer ownership to fdopendir(). 
*/ move_fd(dfd); while ((direntp = readdir(dir))) { if (!strcmp(direntp->d_name, ".") || !strcmp(direntp->d_name, "..")) continue; ret = fstatat(dfd_dup, direntp->d_name, &st, AT_SYMLINK_NOFOLLOW); if (ret < 0 && errno != ENOENT) break; ret = 0; if (S_ISDIR(st.st_mode)) ret = print_r(dfd_dup, direntp->d_name); else INFO("mode(%o):uid(%d):gid(%d) -> %d/%s\n", (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, dfd_dup, direntp->d_name); if (ret < 0 && errno != ENOENT) break; } if (is_empty_string(path)) ret = fstatat(fd, "", &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH); else ret = fstatat(fd, path, &st, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW); if (ret) return -1; else INFO("mode(%o):uid(%d):gid(%d) -> %s", (st.st_mode & ~S_IFMT), st.st_uid, st.st_gid, maybe_empty(path)); return ret; }