mirror of
https://git.proxmox.com/git/mirror_lxc
synced 2025-04-28 16:37:44 +00:00
Merge pull request #4153 from brauner/2022-06-21.unprivileged-cgroup2
use systemd dbus StartTransientUnit for unpriv cgroup2
This commit is contained in:
commit
2e6e374c0a
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@ -26,7 +26,7 @@ jobs:
|
||||
run: |
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -qq gcc clang meson llvm
|
||||
sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x
|
||||
sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x libsystemd-dev
|
||||
|
||||
- name: Compiler version
|
||||
env:
|
||||
|
2
.github/workflows/coverity.yml
vendored
2
.github/workflows/coverity.yml
vendored
@ -25,7 +25,7 @@ jobs:
|
||||
run: |
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -qq gcc clang
|
||||
sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev docbook2x
|
||||
sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev docbook2x libsystemd-dev
|
||||
|
||||
- name: Compiler version
|
||||
run: |
|
||||
|
2
.github/workflows/sanitizers.sh
vendored
2
.github/workflows/sanitizers.sh
vendored
@ -18,7 +18,7 @@ apt-get install --yes --no-install-recommends \
|
||||
libpam0g-dev libseccomp-dev libselinux1-dev libtool linux-libc-dev \
|
||||
llvm lsb-release make openssl pkg-config python3-all-dev \
|
||||
python3-setuptools rsync squashfs-tools uidmap unzip uuid-runtime \
|
||||
wget xz-utils systemd-coredump
|
||||
wget xz-utils systemd-coredump libsystemd-dev
|
||||
apt-get remove --yes lxc-utils liblxc-common liblxc1 liblxc-dev
|
||||
|
||||
ARGS="-Dprefix=/usr -Dtests=true -Dpam-cgroup=false -Dwerror=true -Dio-uring-event-loop=false -Db_lto_mode=default -Db_lundef=false"
|
||||
|
2
.github/workflows/sanitizers.yml
vendored
2
.github/workflows/sanitizers.yml
vendored
@ -22,7 +22,7 @@ jobs:
|
||||
run: |
|
||||
sudo apt-get update -qq
|
||||
sudo apt-get install -qq gcc clang meson llvm
|
||||
sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x
|
||||
sudo apt-get install -qq libapparmor-dev libcap-dev libseccomp-dev libselinux1-dev linux-libc-dev libpam0g-dev docbook2x libsystemd-dev
|
||||
|
||||
- name: Compiler version
|
||||
env:
|
||||
|
51
meson.build
51
meson.build
@ -151,6 +151,7 @@ want_oss_fuzz = get_option('oss-fuzz')
|
||||
want_seccomp = get_option('seccomp')
|
||||
want_thread_safety = get_option('thread-safety')
|
||||
want_memfd_rexec = get_option('memfd-rexec')
|
||||
want_sd_bus = get_option('sd-bus')
|
||||
|
||||
srcconf.set_quoted('DEFAULT_CGROUP_PATTERN', cgrouppattern)
|
||||
if coverity
|
||||
@ -256,6 +257,49 @@ else
|
||||
srcconf.set10('HAVE_LIBURING', false)
|
||||
endif
|
||||
|
||||
# Detect libsystemd's sd-bus/sd-event support.
#
# 'sd-bus' is a feature option: when it is 'enabled' every probe below is
# mandatory and a miss aborts configuration; when it is 'auto' a miss merely
# turns the feature off. HAVE_LIBSYSTEMD ends up 1 only if all probes pass.
if not want_sd_bus.disabled()
    has_sd_bus = true
    sd_bus_optional = want_sd_bus.auto()

    libsystemd = dependency('libsystemd', required: not sd_bus_optional)
    if not libsystemd.found()
        if not sd_bus_optional
            error('missing required libsystemd dependency')
        endif

        has_sd_bus = false
    endif

    # Probe the headers with the dependency's own compile flags so that a
    # libsystemd installed outside the default include path is still found
    # (the function probe below already did this).
    if not cc.has_header('systemd/sd-bus.h', dependencies: libsystemd)
        if not sd_bus_optional
            error('libsystemd misses required systemd/sd-bus.h header')
        endif

        has_sd_bus = false
    endif

    if not cc.has_header('systemd/sd-event.h', dependencies: libsystemd)
        if not sd_bus_optional
            error('libsystemd misses required systemd/sd-event.h header')
        endif

        has_sd_bus = false
    endif

    if not cc.has_function('sd_bus_call_method_asyncv', prefix: '#include <systemd/sd-bus.h>', dependencies: libsystemd)
        if not sd_bus_optional
            error('libsystemd misses required sd_bus_call_method_asyncv function')
        endif

        has_sd_bus = false
    endif

    srcconf.set10('HAVE_LIBSYSTEMD', has_sd_bus)
else
    has_sd_bus = false
    srcconf.set10('HAVE_LIBSYSTEMD', false)
endif
|
||||
|
||||
## Time EPOCH.
|
||||
sh = find_program('sh')
|
||||
date = find_program('date')
|
||||
@ -639,6 +683,8 @@ endforeach
|
||||
found_headers = []
|
||||
missing_headers = []
|
||||
foreach tuple: [
|
||||
['systemd/sd-bus.h'],
|
||||
['systemd/sd-event.h'],
|
||||
['sys/resource.h'],
|
||||
['sys/memfd.h'],
|
||||
['sys/personality.h'],
|
||||
@ -676,6 +722,7 @@ foreach tuple: [
|
||||
['pam'],
|
||||
['openssl'],
|
||||
['liburing'],
|
||||
['libsystemd'],
|
||||
]
|
||||
|
||||
if tuple.length() >= 2
|
||||
@ -750,6 +797,10 @@ if want_io_uring
|
||||
liblxc_dependencies += [liburing]
|
||||
endif
|
||||
|
||||
if has_sd_bus
|
||||
liblxc_dependencies += [libsystemd]
|
||||
endif
|
||||
|
||||
liblxc_link_whole = [liblxc_static]
|
||||
|
||||
liblxc = shared_library(
|
||||
|
@ -22,6 +22,9 @@ option('init-script', type : 'array',
|
||||
option('io-uring-event-loop', type: 'boolean', value: 'false',
|
||||
description: 'Enable io-uring based event loop')
|
||||
|
||||
option('sd-bus', type: 'feature', value: 'auto',
|
||||
description: 'Enable linking against sd-bus')
|
||||
|
||||
# was --{disable,enable}-doc in autotools
|
||||
option('man', type: 'boolean', value: 'true',
|
||||
description: 'build and install manpages')
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <grp.h>
|
||||
#include <linux/kdev_t.h>
|
||||
#include <linux/types.h>
|
||||
#include <libgen.h>
|
||||
#include <poll.h>
|
||||
#include <signal.h>
|
||||
#include <stdint.h>
|
||||
@ -57,6 +58,11 @@
|
||||
#include "strlcat.h"
|
||||
#endif
|
||||
|
||||
#if HAVE_LIBSYSTEMD
|
||||
#include <systemd/sd-bus.h>
|
||||
#include <systemd/sd-event.h>
|
||||
#endif
|
||||
|
||||
lxc_log_define(cgfsng, cgroup);
|
||||
|
||||
/*
|
||||
@ -947,6 +953,354 @@ static bool check_cgroup_dir_config(struct lxc_conf *conf)
|
||||
return true;
|
||||
}
|
||||
|
||||
#define SYSTEMD_SCOPE_FAILED 2
|
||||
#define SYSTEMD_SCOPE_UNSUPP 1
|
||||
#define SYSTEMD_SCOPE_SUCCESS 0
|
||||
|
||||
#if HAVE_LIBSYSTEMD
|
||||
/* State handed to systemd_jobremoved_callback() as its userdata. */
struct sd_callback_data {
	/* Unit name a JobRemoved signal must carry; NULL accepts any unit. */
	char *scope_name;
	/* Set to true by the callback once a job finished with result "done". */
	bool job_complete;
};
|
||||
|
||||
/*
 * sd-bus message handler for the manager's "JobRemoved" signal.
 *
 * @m        signal message; its body is read as (u, o, s, s) = job id,
 *           job object path, unit name, result string
 * @userdata struct sd_callback_data describing the unit we are waiting for
 * @error    unused
 *
 * Returns -1 if the message cannot be parsed or concerns another unit,
 * 1 after flagging completion for a "done" result, and 0 for any other
 * result string.
 */
static int systemd_jobremoved_callback(sd_bus_message *m, void *userdata, sd_bus_error *error)
{
	struct sd_callback_data *cb = userdata;
	char *job_path, *unit_name, *job_result;
	uint32_t job_id;
	int ret;

	ret = sd_bus_message_read(m, "uoss", &job_id, &job_path, &unit_name, &job_result);
	if (ret < 0)
		return log_error(-1, "bad message received in callback: %s", strerror(-ret));

	/* With a scope name set, signals about any other unit are ignored. */
	if (cb->scope_name && strcmp(unit_name, cb->scope_name) != 0)
		return log_trace(-1, "unit was '%s' not '%s'", unit_name, cb->scope_name);

	if (strcmp(job_result, "done") != 0)
		return log_debug(0, "result was '%s', not 'done'", job_result);

	cb->job_complete = true;
	return log_info(1, "job is done");
}
|
||||
|
||||
#define DESTINATION "org.freedesktop.systemd1"
|
||||
#define PATH "/org/freedesktop/systemd1"
|
||||
#define INTERFACE "org.freedesktop.systemd1.Manager"
|
||||
#define MEMBER "StartTransientUnit"
|
||||
/*
 * Ask the systemd manager on @bus to start a transient unit named
 * @data->scope_name (StartTransientUnit, mode "fail") holding the calling
 * process, with Delegate=true and CollectMode=inactive-or-failed, then wait
 * for the job to finish.
 *
 * @bus   connection the request is built on
 * @data  carries the unit name; ->job_complete is expected to be flipped by
 *        the JobRemoved handler while @event is being run
 * @event event loop driven while waiting
 *
 * Returns true once @data->job_complete is set. Returns false if any sd-bus
 * call fails or the job did not complete within the wait budget.
 */
static bool start_scope(sd_bus *bus, struct sd_callback_data *data, struct sd_event *event)
{
	__attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;
	__attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL;
	__attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL;
	char *path = NULL;
	int tries = 0;
	int r;

	r = sd_bus_message_new_method_call(bus, &m,
					   DESTINATION, PATH, INTERFACE, MEMBER);
	if (r < 0)
		return log_error(false, "Failed creating sdbus message");

	/* unit name and job mode */
	r = sd_bus_message_append(m, "ss", data->scope_name, "fail");
	if (r < 0)
		return log_error(false, "Failed setting systemd scope name");

	/* unit properties: a(sv) */
	r = sd_bus_message_open_container(m, 'a', "(sv)");
	if (r < 0)
		return log_error(false, "Failed allocating sdbus msg properties");

	r = sd_bus_message_append(m, "(sv)(sv)(sv)",
				  "PIDs", "au", 1, getpid(),
				  "Delegate", "b", 1,
				  "CollectMode", "s", "inactive-or-failed");
	if (r < 0)
		return log_error(false, "Failed setting properties on sdbus message");

	r = sd_bus_message_close_container(m);
	if (r < 0)
		return log_error(false, "Failed closing sdbus message properties");

	/* empty list of auxiliary units */
	r = sd_bus_message_append(m, "a(sa(sv))", 0);
	if (r < 0)
		return log_error(false, "Failed appending aux boilerplate");

	r = sd_bus_call(NULL, m, 0, &error, &reply);
	if (r < 0)
		return log_error(false, "Failed sending sdbus message: %s", error.message);

	/* Parse the response message (the job's object path) */
	r = sd_bus_message_read(reply, "o", &path);
	if (r < 0)
		return log_error(false, "Failed to parse response message: %s", strerror(-r));

	/*
	 * Now spin up a mini-event-loop to wait for the "job completed"
	 * message. Each iteration waits at most one second. Timeouts and
	 * sd_event_run() failures both consume the retry budget, so a
	 * persistently failing event loop cannot keep us here forever.
	 */
	while (!data->job_complete) {
		r = sd_event_run(event, 1000 * 1000);
		if (r < 0)
			DEBUG("Error waiting for JobRemoved: %s", strerror(-r));

		if (data->job_complete || tries >= 5)
			break;

		if (r > 0)
			TRACE("Debug: we processed an event (%d), but not the one we wanted", r);
		else
			tries++; /* timeout or failure */
	}

	if (!data->job_complete)
		return log_error(false, "Error: %s job was never removed", data->scope_name);

	return true;
}
|
||||
|
||||
/*
 * Decide whether @contents (the text of a /proc/<pid>/cgroup file) describes
 * a pure unified (cgroup2-only) system: exactly one line, and that line
 * starts with "0:" and is longer than three characters.
 *
 * @contents is modified in place: the newline ending the first line is
 * replaced by a NUL byte, so afterwards the buffer reads as just that line.
 *
 * Returns false for NULL or empty input, for multi-line input, and for a
 * single line that does not look like a "0:" entry.
 */
static bool string_pure_unified_system(char *contents)
{
	char *line, *eol;

	if (!contents)
		return false;

	/* Ignore leading newlines, as a tokenizer splitting on "\n" would. */
	line = contents + strspn(contents, "\n");
	if (*line == '\0')
		return false;

	eol = strchr(line, '\n');
	if (eol) {
		*eol = '\0';

		/*
		 * Anything but trailing newlines after the first line means a
		 * second hierarchy is listed, so this is not pure unified even
		 * when the first line itself is the "0:" entry.
		 */
		eol++;
		if (eol[strspn(eol, "\n")] != '\0')
			return false;
	}

	return strlen(line) > 3 && strncmp(line, "0:", 2) == 0;
}
|
||||
|
||||
/*
|
||||
* Only call get_current_unified_cgroup() when we are in a pure
|
||||
* unified (v2-only) cgroup
|
||||
*/
|
||||
static char *get_current_unified_cgroup(void)
|
||||
{
|
||||
__do_free char *buf = NULL;
|
||||
__do_free_string_list char **list = NULL;
|
||||
char *p;
|
||||
|
||||
buf = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
|
||||
if (!buf)
|
||||
return NULL;
|
||||
|
||||
if (!string_pure_unified_system(buf))
|
||||
return NULL;
|
||||
|
||||
// 0::/user.slice/user-1000.slice/session-136.scope
|
||||
// Get past the "0::"
|
||||
p = buf;
|
||||
if (strnequal(p, "0::", STRLITERALLEN("0::")))
|
||||
p += STRLITERALLEN("0::");
|
||||
|
||||
return strdup(p);
|
||||
}
|
||||
|
||||
static bool pure_unified_system(void)
|
||||
{
|
||||
__do_free char *buf = NULL;
|
||||
|
||||
buf = read_file_at(-EBADF, "/proc/self/cgroup", PROTECT_OPEN, 0);
|
||||
if (!buf)
|
||||
return false;
|
||||
|
||||
return string_pure_unified_system(buf);
|
||||
}
|
||||
|
||||
#define MEMBER_JOIN "AttachProcessesToUnit"
|
||||
static bool enter_scope(char *scope_name, pid_t pid)
|
||||
{
|
||||
__attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL;
|
||||
__attribute__((__cleanup__(sd_bus_error_free))) sd_bus_error error = SD_BUS_ERROR_NULL;;
|
||||
__attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *reply = NULL;
|
||||
__attribute__((__cleanup__(sd_bus_message_unrefp))) sd_bus_message *m = NULL;
|
||||
int r;
|
||||
|
||||
r = sd_bus_open_user(&bus);
|
||||
if (r < 0)
|
||||
return log_error(false, "Failed to connect to user bus: %s", strerror(-r));
|
||||
|
||||
r = sd_bus_message_new_method_call(bus, &m,
|
||||
DESTINATION, PATH, INTERFACE, MEMBER_JOIN);
|
||||
if (r < 0)
|
||||
return log_error(false, "Failed creating sdbus message");
|
||||
|
||||
r = sd_bus_message_append(m, "ssau", scope_name, "/init", 1, pid);
|
||||
if (r < 0)
|
||||
return log_error(false, "Failed setting systemd scope name");
|
||||
|
||||
|
||||
r = sd_bus_call(NULL, m, 0, &error, &reply);
|
||||
if (r < 0)
|
||||
return log_error(false, "Failed sending sdbus message: %s", error.message);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool enable_controllers_delegation(int fd_dir, char *cg)
|
||||
{
|
||||
__do_free char *rbuf = NULL;
|
||||
__do_free char *wbuf = NULL;
|
||||
__do_free_string_list char **cpulist = NULL;
|
||||
char *controller;
|
||||
size_t full_len = 0;
|
||||
bool first = true;
|
||||
int ret;
|
||||
|
||||
rbuf = read_file_at(fd_dir, "cgroup.controllers", PROTECT_OPEN, 0);
|
||||
if (!rbuf)
|
||||
return false;
|
||||
|
||||
lxc_iterate_parts(controller, rbuf, " ") {
|
||||
full_len += strlen(controller) + 2;
|
||||
wbuf = must_realloc(wbuf, full_len);
|
||||
if (first) {
|
||||
wbuf[0] = '\0';
|
||||
first = false;
|
||||
} else {
|
||||
(void)strlcat(wbuf, " ", full_len + 1);
|
||||
}
|
||||
strlcat(wbuf, "+", full_len + 1);
|
||||
strlcat(wbuf, controller, full_len + 1);
|
||||
}
|
||||
if (!wbuf)
|
||||
return log_debug(true, "No controllers to delegate!");
|
||||
|
||||
ret = lxc_writeat(fd_dir, "cgroup.subtree_control", wbuf, strlen(wbuf));
|
||||
if (ret < 0)
|
||||
return log_error_errno(false, errno, "Failed to write \"%s\" to %s/cgroup.subtree_control", wbuf, cg);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* systemd places us in say .../lxc-1.scope. We create lxc-1.scope/init,
|
||||
* move ourselves to there, then enable controllers in lxc-1.scope
|
||||
*/
|
||||
static bool move_and_delegate_unified(char *parent_cgroup)
|
||||
{
|
||||
__do_free char *buf = NULL;
|
||||
__do_close int fd_parent = -EBADF;
|
||||
int ret;
|
||||
|
||||
fd_parent = open_at(-EBADF, parent_cgroup, O_DIRECTORY, 0, 0);
|
||||
if (fd_parent < 0)
|
||||
return syserror_ret(false, "Failed opening cgroup dir \"%s\"", parent_cgroup);
|
||||
|
||||
ret = mkdirat(fd_parent, "init", 0755);
|
||||
if (ret < 0 && errno != EEXIST)
|
||||
return syserror_ret(false, "Failed to create \"%d/init\" cgroup", fd_parent);
|
||||
|
||||
buf = read_file_at(fd_parent, "cgroup.procs", PROTECT_OPEN, 0);
|
||||
if (!buf)
|
||||
return false;
|
||||
|
||||
ret = lxc_writeat(fd_parent, "init/cgroup.procs", buf, strlen(buf));
|
||||
if (ret)
|
||||
return syserror_ret(false, "Failed to escape to cgroup \"init/cgroup.procs\"");
|
||||
|
||||
/* enable controllers in parent_cgroup */
|
||||
return enable_controllers_delegation(fd_parent, parent_cgroup);
|
||||
}
|
||||
|
||||
static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf)
|
||||
{
|
||||
__do_free char *full_scope_name = NULL;
|
||||
__do_free char *fs_cg_path = NULL;
|
||||
sd_event *event = NULL;
|
||||
__attribute__((__cleanup__(sd_bus_unrefp))) sd_bus *bus = NULL; // free the bus before the names it references, just to be sure
|
||||
struct sd_callback_data sd_data;
|
||||
int idx = 0;
|
||||
size_t len;
|
||||
int r;
|
||||
|
||||
if (geteuid() == 0)
|
||||
return log_info(SYSTEMD_SCOPE_UNSUPP, "Running privileged, not using a systemd unit");
|
||||
// Pure_unified_layout() can't be used as that info is not yet setup. At
|
||||
// the same time, we don't want to calculate current cgroups until after
|
||||
// we optionally enter a new systemd user scope. So let's just do a quick
|
||||
// check for pure unified cgroup system: single line /proc/self/cgroup with
|
||||
// only index '0:'
|
||||
if (!pure_unified_system())
|
||||
return log_info(SYSTEMD_SCOPE_UNSUPP, "Not in unified layout, not using a systemd unit");
|
||||
|
||||
r = sd_bus_open_user(&bus);
|
||||
if (r < 0)
|
||||
return log_error(SYSTEMD_SCOPE_FAILED, "Failed to connect to user bus: %s", strerror(-r));
|
||||
|
||||
r = sd_bus_call_method_asyncv(bus, NULL, DESTINATION, PATH, INTERFACE, "Subscribe", NULL, NULL, NULL, NULL);
|
||||
if (r < 0)
|
||||
return log_error(SYSTEMD_SCOPE_FAILED, "Failed to subscribe to signals: %s", strerror(-r));
|
||||
|
||||
sd_data.job_complete = false;
|
||||
sd_data.scope_name = NULL;
|
||||
r = sd_bus_match_signal(bus,
|
||||
NULL, // no slot
|
||||
DESTINATION, PATH, INTERFACE, "JobRemoved",
|
||||
systemd_jobremoved_callback, &sd_data);
|
||||
if (r < 0)
|
||||
return log_error(SYSTEMD_SCOPE_FAILED, "Failed to register systemd event loop signal handler: %s", strerror(-r));
|
||||
|
||||
// NEXT: create and attach event
|
||||
r = sd_event_new(&event);
|
||||
if (r < 0)
|
||||
return log_error(SYSTEMD_SCOPE_FAILED, "Failed allocating new event: %s\n", strerror(-r));
|
||||
r = sd_bus_attach_event(bus, event, SD_EVENT_PRIORITY_NORMAL);
|
||||
if (r < 0) {
|
||||
// bus won't clean up event since the attach failed
|
||||
sd_event_unrefp(&event);
|
||||
return log_error(SYSTEMD_SCOPE_FAILED, "Failed attaching event: %s\n", strerror(-r));
|
||||
}
|
||||
|
||||
// "lxc-" + (conf->name) + "-NN" + ".scope" + '\0'
|
||||
len = STRLITERALLEN("lxc-") + strlen(conf->name) + 3 + STRLITERALLEN(".scope") + 1;
|
||||
full_scope_name = malloc(len);
|
||||
if (!full_scope_name)
|
||||
return syserror("Out of memory");
|
||||
|
||||
do {
|
||||
snprintf(full_scope_name, len, "lxc-%s-%d.scope", conf->name, idx);
|
||||
sd_data.scope_name = full_scope_name;
|
||||
if (start_scope(bus, &sd_data, event)) {
|
||||
conf->cgroup_meta.systemd_scope = get_current_unified_cgroup();
|
||||
if (!conf->cgroup_meta.systemd_scope)
|
||||
return log_trace(SYSTEMD_SCOPE_FAILED, "Out of memory");
|
||||
fs_cg_path = must_make_path("/sys/fs/cgroup", conf->cgroup_meta.systemd_scope, NULL);
|
||||
if (!move_and_delegate_unified(fs_cg_path))
|
||||
return log_error(SYSTEMD_SCOPE_FAILED, "Failed delegating the controllers to our cgroup");
|
||||
return log_trace(SYSTEMD_SCOPE_SUCCESS, "Created systemd scope %s", full_scope_name);
|
||||
}
|
||||
idx++;
|
||||
} while (idx < 99);
|
||||
|
||||
return SYSTEMD_SCOPE_FAILED; // failed, let's try old-school after all
|
||||
}
|
||||
#else /* !HAVE_LIBSYSTEMD */
|
||||
static int unpriv_systemd_create_scope(struct cgroup_ops *ops, struct lxc_conf *conf)
|
||||
{
|
||||
TRACE("unpriv_systemd_create_scope: no systemd support");
|
||||
return SYSTEMD_SCOPE_UNSUPP; // not supported
|
||||
}
|
||||
#endif /* HAVE_LIBSYSTEMD */
|
||||
|
||||
// Return a duplicate of cgroup path @cg without leading /, so
|
||||
// that caller can own+free it and be certain it's not abspath.
|
||||
/*
 * Duplicate @cg after passing it through deabs().
 *
 * Returns NULL when @cg is NULL or exactly "/", ERR_PTR(-ENOMEM) when the
 * copy cannot be allocated, and otherwise a heap string the caller must free.
 */
static char *cgroup_relpath(char *cg)
{
	char *rel;

	if (!cg)
		return NULL;

	if (strequal(cg, "/"))
		return NULL;

	rel = strdup(deabs(cg));
	if (rel)
		return rel;

	return ERR_PTR(-ENOMEM);
}
|
||||
|
||||
__cgfsng_ops static bool cgfsng_monitor_create(struct cgroup_ops *ops, struct lxc_handler *handler)
|
||||
{
|
||||
__do_free char *monitor_cgroup = NULL;
|
||||
@ -1176,14 +1530,19 @@ __cgfsng_ops static bool cgfsng_monitor_enter(struct cgroup_ops *ops,
|
||||
if (ret)
|
||||
return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
|
||||
|
||||
TRACE("Moved monitor into cgroup %d", h->dfd_mon);
|
||||
TRACE("Moved monitor (%d) into cgroup %d", handler->monitor_pid, h->dfd_mon);
|
||||
|
||||
if (handler->transient_pid <= 0)
|
||||
continue;
|
||||
|
||||
ret = lxc_writeat(h->dfd_mon, "cgroup.procs", transient, transient_len);
|
||||
if (ret)
|
||||
return log_error_errno(false, errno, "Failed to enter cgroup %d", h->dfd_mon);
|
||||
if (ret) {
|
||||
// TODO: probably ask systemd to do the move for us instead
|
||||
if (!handler->conf->cgroup_meta.systemd_scope)
|
||||
return log_error_errno(false, errno, "Failed to enter pid %d into cgroup %d", handler->transient_pid, h->dfd_mon);
|
||||
else
|
||||
TRACE("Failed moving transient process into cgroup %d", h->dfd_mon);
|
||||
}
|
||||
|
||||
TRACE("Moved transient process into cgroup %d", h->dfd_mon);
|
||||
|
||||
@ -2184,14 +2543,30 @@ static int cgroup_attach_create_leaf(const struct lxc_conf *conf,
|
||||
}
|
||||
|
||||
static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
|
||||
const char *lxcpath,
|
||||
int unified_fd, int *sk_fd, pid_t pid,
|
||||
bool unprivileged)
|
||||
{
|
||||
__do_close int sk = *sk_fd, target_fd0 = -EBADF, target_fd1 = -EBADF;
|
||||
char pidstr[INTTYPE_TO_STRLEN(int64_t) + 1];
|
||||
size_t pidstr_len;
|
||||
#if HAVE_LIBSYSTEMD
|
||||
__do_free char *scope = NULL;
|
||||
#endif
|
||||
ssize_t ret;
|
||||
|
||||
#if HAVE_LIBSYSTEMD
|
||||
scope = lxc_cmd_get_systemd_scope(conf->name, lxcpath);
|
||||
if (scope) {
|
||||
TRACE("%s:%s is running under systemd-created scope '%s'. Attaching...", lxcpath, conf->name, scope);
|
||||
if (enter_scope(scope, pid))
|
||||
TRACE("Successfully entered scope '%s'", scope);
|
||||
else
|
||||
ERROR("Failed entering scope '%s'", scope);
|
||||
} else {
|
||||
TRACE("%s:%s is not running under a systemd-created scope", lxcpath, conf->name);
|
||||
}
|
||||
#endif
|
||||
if (unprivileged) {
|
||||
ret = lxc_abstract_unix_recv_two_fds(sk, &target_fd0, &target_fd1);
|
||||
if (ret < 0)
|
||||
@ -2229,6 +2604,7 @@ static int cgroup_attach_move_into_leaf(const struct lxc_conf *conf,
|
||||
|
||||
struct userns_exec_unified_attach_data {
|
||||
const struct lxc_conf *conf;
|
||||
const char *lxcpath;
|
||||
int unified_fd;
|
||||
int sk_pair[2];
|
||||
pid_t pid;
|
||||
@ -2239,8 +2615,8 @@ static int cgroup_unified_attach_child_wrapper(void *data)
|
||||
{
|
||||
struct userns_exec_unified_attach_data *args = data;
|
||||
|
||||
if (!args->conf || args->unified_fd < 0 || args->pid <= 0 ||
|
||||
args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
|
||||
if (!args->conf || !args->lxcpath || args->unified_fd < 0 ||
|
||||
args->pid <= 0 || args->sk_pair[0] < 0 || args->sk_pair[1] < 0)
|
||||
return ret_errno(EINVAL);
|
||||
|
||||
close_prot_errno_disarm(args->sk_pair[0]);
|
||||
@ -2257,7 +2633,8 @@ static int cgroup_unified_attach_parent_wrapper(void *data)
|
||||
return ret_errno(EINVAL);
|
||||
|
||||
close_prot_errno_disarm(args->sk_pair[1]);
|
||||
return cgroup_attach_move_into_leaf(args->conf, args->unified_fd,
|
||||
return cgroup_attach_move_into_leaf(args->conf, args->lxcpath,
|
||||
args->unified_fd,
|
||||
&args->sk_pair[0], args->pid,
|
||||
args->unprivileged);
|
||||
}
|
||||
@ -2286,6 +2663,7 @@ static int __cg_unified_attach(const struct hierarchy *h,
|
||||
ret = cgroup_attach(conf, name, lxcpath, pid);
|
||||
if (ret == 0)
|
||||
return log_trace(0, "Attached to unified cgroup via command handler");
|
||||
TRACE("__cg_unified_attach: cgroup_attach returned %d", ret);
|
||||
if (!ERRNO_IS_NOT_SUPPORTED(ret) && ret != -ENOCGROUP2)
|
||||
return log_error_errno(ret, errno, "Failed to attach to unified cgroup");
|
||||
|
||||
@ -2294,6 +2672,7 @@ static int __cg_unified_attach(const struct hierarchy *h,
|
||||
/* not running */
|
||||
if (!cgroup)
|
||||
return 0;
|
||||
TRACE("lxc_cmd_get_cgroup_path returned %s", cgroup);
|
||||
|
||||
path = make_cgroup_path(h, cgroup, NULL);
|
||||
|
||||
@ -2307,6 +2686,7 @@ static int __cg_unified_attach(const struct hierarchy *h,
|
||||
.unified_fd = unified_fd,
|
||||
.pid = pid,
|
||||
.unprivileged = am_guest_unpriv(),
|
||||
.lxcpath = lxcpath,
|
||||
};
|
||||
|
||||
ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
|
||||
@ -3152,12 +3532,19 @@ static const char *stable_order(const char *controllers)
|
||||
#define CGFSNG_LAYOUT_UNIFIED BIT(1)
|
||||
|
||||
static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
|
||||
bool unprivileged)
|
||||
bool unprivileged, struct lxc_conf *conf)
|
||||
{
|
||||
__do_free char *cgroup_info = NULL;
|
||||
unsigned int layout_mask = 0;
|
||||
int ret;
|
||||
char *it;
|
||||
|
||||
ret = unpriv_systemd_create_scope(ops, conf);
|
||||
if (ret < 0)
|
||||
return ret_set_errno(false, ret);
|
||||
else if (ret == 0)
|
||||
TRACE("Entered an unpriv systemd scope");
|
||||
|
||||
/*
|
||||
* Root spawned containers escape the current cgroup, so use init's
|
||||
* cgroups as our base in that case.
|
||||
@ -3175,7 +3562,7 @@ static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
|
||||
__do_free_string_list char **controller_list = NULL,
|
||||
**delegate = NULL;
|
||||
char *line;
|
||||
int dfd, ret, type;
|
||||
int dfd, type;
|
||||
|
||||
/* Handle the unified cgroup hierarchy. */
|
||||
line = it;
|
||||
@ -3185,7 +3572,10 @@ static int __initialize_cgroups(struct cgroup_ops *ops, bool relative,
|
||||
type = UNIFIED_HIERARCHY;
|
||||
layout_mask |= CGFSNG_LAYOUT_UNIFIED;
|
||||
|
||||
current_cgroup = current_unified_cgroup(relative, line);
|
||||
if (conf->cgroup_meta.systemd_scope)
|
||||
current_cgroup = cgroup_relpath(conf->cgroup_meta.systemd_scope);
|
||||
if (IS_ERR_OR_NULL(current_cgroup))
|
||||
current_cgroup = current_unified_cgroup(relative, line);
|
||||
if (IS_ERR(current_cgroup))
|
||||
return PTR_ERR(current_cgroup);
|
||||
|
||||
@ -3429,7 +3819,7 @@ static int initialize_cgroups(struct cgroup_ops *ops, struct lxc_conf *conf)
|
||||
*/
|
||||
ops->dfd_mnt = dfd;
|
||||
|
||||
ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map));
|
||||
ret = __initialize_cgroups(ops, conf->cgroup_meta.relative, !list_empty(&conf->id_map), conf);
|
||||
if (ret < 0)
|
||||
return syserror_ret(ret, "Failed to initialize cgroups");
|
||||
|
||||
@ -3502,7 +3892,7 @@ struct cgroup_ops *cgroup_ops_init(struct lxc_conf *conf)
|
||||
return move_ptr(cgfsng_ops);
|
||||
}
|
||||
|
||||
static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_t pid)
|
||||
static int __unified_attach_fd(const struct lxc_conf *conf, const char *lxcpath, int fd_unified, pid_t pid)
|
||||
{
|
||||
int ret;
|
||||
|
||||
@ -3512,6 +3902,7 @@ static int __unified_attach_fd(const struct lxc_conf *conf, int fd_unified, pid_
|
||||
.unified_fd = fd_unified,
|
||||
.pid = pid,
|
||||
.unprivileged = am_guest_unpriv(),
|
||||
.lxcpath = lxcpath,
|
||||
};
|
||||
|
||||
ret = socketpair(PF_LOCAL, SOCK_STREAM | SOCK_CLOEXEC, 0, args.sk_pair);
|
||||
@ -3555,7 +3946,7 @@ static int __cgroup_attach_many(const struct lxc_conf *conf, const char *name,
|
||||
int dfd_con = ctx->fd[idx];
|
||||
|
||||
if (unified_cgroup_fd(dfd_con))
|
||||
ret = __unified_attach_fd(conf, dfd_con, pid);
|
||||
ret = __unified_attach_fd(conf, lxcpath, dfd_con, pid);
|
||||
else
|
||||
ret = lxc_writeat(dfd_con, "cgroup.procs", pidstr, pidstr_len);
|
||||
if (ret)
|
||||
@ -3580,7 +3971,7 @@ static int __cgroup_attach_unified(const struct lxc_conf *conf, const char *name
|
||||
if (dfd_unified < 0)
|
||||
return ret_errno(ENOSYS);
|
||||
|
||||
return __unified_attach_fd(conf, dfd_unified, pid);
|
||||
return __unified_attach_fd(conf, lxcpath, dfd_unified, pid);
|
||||
}
|
||||
|
||||
int cgroup_attach(const struct lxc_conf *conf, const char *name,
|
||||
|
@ -89,6 +89,7 @@ static const char *lxc_cmd_str(lxc_cmd_t cmd)
|
||||
[LXC_CMD_GET_CGROUP_CTX] = "get_cgroup_ctx",
|
||||
[LXC_CMD_GET_CGROUP_FD] = "get_cgroup_fd",
|
||||
[LXC_CMD_GET_LIMIT_CGROUP_FD] = "get_limit_cgroup_fd",
|
||||
[LXC_CMD_GET_SYSTEMD_SCOPE] = "get_systemd_scope",
|
||||
};
|
||||
|
||||
if (cmd >= LXC_CMD_MAX)
|
||||
@ -1316,6 +1317,55 @@ static int lxc_cmd_get_lxcpath_callback(int fd, struct lxc_cmd_req *req,
|
||||
return lxc_cmd_rsp_send_reap(fd, &rsp);
|
||||
}
|
||||
|
||||
char *lxc_cmd_get_systemd_scope(const char *name, const char *lxcpath)
|
||||
{
|
||||
bool stopped = false;
|
||||
ssize_t ret;
|
||||
struct lxc_cmd_rr cmd;
|
||||
|
||||
lxc_cmd_init(&cmd, LXC_CMD_GET_SYSTEMD_SCOPE);
|
||||
|
||||
ret = lxc_cmd(name, &cmd, &stopped, lxcpath, NULL);
|
||||
if (ret < 0)
|
||||
return NULL;
|
||||
|
||||
if (cmd.rsp.ret == 0)
|
||||
return cmd.rsp.data;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static int lxc_cmd_get_systemd_scope_callback(int fd, struct lxc_cmd_req *req,
|
||||
struct lxc_handler *handler,
|
||||
struct lxc_async_descr *descr)
|
||||
{
|
||||
__do_free char *scope = NULL;
|
||||
struct lxc_cmd_rsp rsp = {
|
||||
.ret = -EINVAL,
|
||||
};
|
||||
|
||||
// cgroup_meta.systemd_scope is the full cgroup path to the scope.
|
||||
// The caller just wants the actual scope name, that is, basename().
|
||||
// (XXX - or do we want the caller to massage it? I'm undecided)
|
||||
if (handler->conf->cgroup_meta.systemd_scope) {
|
||||
scope = strrchr(handler->conf->cgroup_meta.systemd_scope, '/');
|
||||
if (scope && *scope)
|
||||
scope++;
|
||||
if (scope && *scope)
|
||||
scope = strdup(scope);
|
||||
}
|
||||
|
||||
if (!scope)
|
||||
goto out;
|
||||
|
||||
rsp.ret = 0;
|
||||
rsp.data = scope;
|
||||
rsp.datalen = strlen(scope) + 1;
|
||||
|
||||
out:
|
||||
return lxc_cmd_rsp_send_reap(fd, &rsp);
|
||||
}
|
||||
|
||||
int lxc_cmd_add_state_client(const char *name, const char *lxcpath,
|
||||
lxc_state_t states[static MAX_STATE],
|
||||
int *state_client_fd)
|
||||
@ -1900,6 +1950,7 @@ static int lxc_cmd_process(int fd, struct lxc_cmd_req *req,
|
||||
[LXC_CMD_GET_CGROUP_CTX] = lxc_cmd_get_cgroup_ctx_callback,
|
||||
[LXC_CMD_GET_CGROUP_FD] = lxc_cmd_get_cgroup_fd_callback,
|
||||
[LXC_CMD_GET_LIMIT_CGROUP_FD] = lxc_cmd_get_limit_cgroup_fd_callback,
|
||||
[LXC_CMD_GET_SYSTEMD_SCOPE] = lxc_cmd_get_systemd_scope_callback,
|
||||
};
|
||||
|
||||
if (req->cmd >= LXC_CMD_MAX)
|
||||
|
@ -52,6 +52,7 @@ typedef enum {
|
||||
LXC_CMD_GET_CGROUP_CTX = 23,
|
||||
LXC_CMD_GET_CGROUP_FD = 24,
|
||||
LXC_CMD_GET_LIMIT_CGROUP_FD = 25,
|
||||
LXC_CMD_GET_SYSTEMD_SCOPE = 26,
|
||||
LXC_CMD_MAX,
|
||||
} lxc_cmd_t;
|
||||
|
||||
@ -115,6 +116,7 @@ __hidden extern char *lxc_cmd_get_config_item(const char *name, const char *item
|
||||
const char *lxcpath);
|
||||
__hidden extern char *lxc_cmd_get_name(const char *hashed_sock);
|
||||
__hidden extern char *lxc_cmd_get_lxcpath(const char *hashed_sock);
|
||||
__hidden extern char *lxc_cmd_get_systemd_scope(const char *name, const char *lxcpath);
|
||||
__hidden extern pid_t lxc_cmd_get_init_pid(const char *name, const char *lxcpath);
|
||||
__hidden extern int lxc_cmd_get_init_pidfd(const char *name, const char *lxcpath);
|
||||
__hidden extern int lxc_cmd_get_state(const char *name, const char *lxcpath);
|
||||
|
@ -4831,6 +4831,7 @@ void lxc_conf_free(struct lxc_conf *conf)
|
||||
free(conf->cgroup_meta.container_dir);
|
||||
free(conf->cgroup_meta.namespace_dir);
|
||||
free(conf->cgroup_meta.controllers);
|
||||
free(conf->cgroup_meta.systemd_scope);
|
||||
free(conf->shmount.path_host);
|
||||
free(conf->shmount.path_cont);
|
||||
free(conf);
|
||||
|
@ -74,6 +74,13 @@ struct lxc_cgroup {
|
||||
char *container_dir;
|
||||
char *namespace_dir;
|
||||
bool relative;
|
||||
/* If an unpriv user in pure unified-only hierarchy
|
||||
* starts a container, then we ask systemd to create
|
||||
* a scope for us, and create the monitor and container
|
||||
* cgroups under that.
|
||||
* This will ignore the above things like monitor_dir
|
||||
*/
|
||||
char *systemd_scope;
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -24,7 +24,7 @@ mkdir -p $OUT
|
||||
apt-get update -qq
|
||||
apt-get install --yes --no-install-recommends \
|
||||
build-essential docbook2x doxygen git \
|
||||
wget xz-utils systemd-coredump pkgconf
|
||||
wget xz-utils systemd-coredump pkgconf libsystemd-dev
|
||||
apt-get remove --yes lxc-utils liblxc-common liblxc1 liblxc-dev
|
||||
|
||||
# make sure we have a new enough meson version
|
||||
|
Loading…
Reference in New Issue
Block a user