mirror of
https://git.proxmox.com/git/mirror_lxc
synced 2025-08-17 22:05:27 +00:00

Long story behind this. Many years ago, Stéphane Graber
discovered an issue with apparmor mount rules.
Since commit 7f2b13275d ("apparmor: Update mount states handling") it has
been prohibited to change mount propagation flags, because adding rules
which allow changing mount propagation gives a user inside the container
the ability to mount everything [1].
Now, with modern systemd versions, this problem has become more critical
than before. For instance, ArchLinux containers fail to start without the
nesting AppArmor profile enabled (because the nesting profile effectively
just allows all mounts). Of course, that's a security issue.
We've also enabled sharing on the container rootfs:
https://github.com/lxc/lxc/pull/4229
Now many workloads need to change the propagation flag to
private (see https://github.com/canonical/craft-parts/pull/400).
Issue:
$ lxc-start -F archlinux-test
systemd 253-1-arch running in system mode (+PAM +AUDIT -SELINUX -APPARMOR -IMA +SMACK +SECCOMP +GCRYPT +GNUTLS +OPENSSL +ACL +BLKID +CURL +ELFUTILS +FIDO2 +IDN2 -IDN +IPTC +KMOD +LIBCRYPTSETUP +LIBFDISK +PCRE2 -PWQUALITY +P11KIT -QRENCODE +TPM2 +BZIP2 +LZ4 +XZ +ZLIB +ZSTD +BPF_FRAMEWORK +XKBCOMMON +UTMP -SYSVINIT default-hierarchy=unified)
Detected virtualization lxc.
Detected architecture x86-64.
Welcome to Arch Linux!
bpf-lsm: BPF LSM hook not enabled in the kernel, BPF LSM not supported
Failed to remount root directory as MS_SLAVE: Permission denied
(sd-gens) failed with exit status 1.
[!!!!!!] Failed to start up manager.
Exiting PID 1...
Workaround (unsafe):
$ lxc-start -s lxc.apparmor.allow_nesting=1 -s lxc.apparmor.profile=generated -F arch-test
John Johansen (AppArmor maintainer) and the LXD team worked on a fix [2].
It has already been merged into the stable AppArmor 3.0 and 3.1 branches.
There is no stable AppArmor version tag containing it yet, but I think it
will be in AppArmor version 3.0.10.
See also:
[1] https://bugs.launchpad.net/apparmor/+bug/1597017
[2] https://gitlab.com/apparmor/apparmor/-/merge_requests/333
Fixes: #4280
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
226 lines
8.3 KiB
Plaintext
226 lines
8.3 KiB
Plaintext
network,
|
|
capability,
|
|
file,
|
|
umount,
|
|
|
|
# dbus, signal, ptrace and unix are only supported by recent apparmor
|
|
# versions. Comment them out if the apparmor parser doesn't recognize them.
|
|
|
|
# This also needs additional rules to reach outside of the container via
|
|
# DBus, so just let all of DBus within the container.
|
|
dbus,
|
|
|
|
# Allow us to receive signals from anywhere. Note: if per-container profiles
|
|
# are supported, for container isolation this should be changed to something
|
|
# like:
|
|
# signal (receive) peer=unconfined,
|
|
# signal (receive) peer=/usr/bin/lxc-start,
|
|
signal (receive),
|
|
|
|
# Allow us to send signals to ourselves
|
|
signal peer=@{profile_name},
|
|
|
|
# Allow other processes to read our /proc entries, futexes, perf tracing and
|
|
# kcmp for now (they will need 'read' in the first place). Administrators can
|
|
# override with:
|
|
# deny ptrace (readby) ...
|
|
ptrace (readby),
|
|
|
|
# Allow other processes to trace us by default (they will need 'trace' in
|
|
# the first place). Administrators can override with:
|
|
# deny ptrace (tracedby) ...
|
|
ptrace (tracedby),
|
|
|
|
# Allow us to ptrace ourselves
|
|
ptrace peer=@{profile_name},
|
|
|
|
# Allow receive via unix sockets from anywhere. Note: if per-container
|
|
# profiles are supported, for container isolation this should be changed to
|
|
# something like:
|
|
# unix (receive) peer=(label=unconfined),
|
|
unix (receive),
|
|
|
|
# Allow all unix in the container
|
|
unix peer=(label=@{profile_name}),
|
|
|
|
# ignore DENIED message on / remount
|
|
deny mount options=(ro, remount) -> /,
|
|
deny mount options=(ro, remount, silent) -> /,
|
|
|
|
# allow tmpfs mounts everywhere
|
|
mount fstype=tmpfs,
|
|
|
|
# allow hugetlbfs mounts everywhere
|
|
mount fstype=hugetlbfs,
|
|
|
|
# allow mqueue mounts everywhere
|
|
mount fstype=mqueue,
|
|
|
|
# allow fuse mounts everywhere
|
|
mount fstype=fuse,
|
|
mount fstype=fuse.*,
|
|
|
|
# deny access under /proc/bus to avoid e.g. messing with pci devices directly
|
|
deny @{PROC}/bus/** wklx,
|
|
|
|
# deny writes in /proc/sys/fs but allow binfmt_misc to be mounted
|
|
mount fstype=binfmt_misc -> /proc/sys/fs/binfmt_misc/,
|
|
deny @{PROC}/sys/fs/** wklx,
|
|
|
|
# allow efivars to be mounted, writing to it will be blocked though
|
|
mount fstype=efivarfs -> /sys/firmware/efi/efivars/,
|
|
|
|
# block some other dangerous paths
|
|
deny @{PROC}/kcore rwklx,
|
|
deny @{PROC}/sysrq-trigger rwklx,
|
|
|
|
# deny writes in /sys except for /sys/fs/cgroup, also allow
|
|
# fusectl, securityfs and debugfs to be mounted there (read-only)
|
|
mount fstype=fusectl -> /sys/fs/fuse/connections/,
|
|
mount fstype=securityfs -> /sys/kernel/security/,
|
|
mount fstype=debugfs -> /sys/kernel/debug/,
|
|
deny mount fstype=debugfs -> /var/lib/ureadahead/debugfs/,
|
|
mount fstype=proc -> /proc/,
|
|
mount fstype=sysfs -> /sys/,
|
|
mount options=(rw, nosuid, nodev, noexec, remount) -> /sys/,
|
|
deny /sys/firmware/efi/efivars/** rwklx,
|
|
deny /sys/kernel/security/** rwklx,
|
|
mount options=(move) /sys/fs/cgroup/cgmanager/ -> /sys/fs/cgroup/cgmanager.lower/,
|
|
mount options=(ro, nosuid, nodev, noexec, remount, strictatime) -> /sys/fs/cgroup/,
|
|
|
|
# deny all access (reads included) to debugfs
|
|
deny /sys/kernel/debug/{,**} rwklx,
|
|
|
|
# allow paths to be made slave, shared, private or unbindable
|
|
mount options=(rw,make-slave) -> **,
|
|
mount options=(rw,make-rslave) -> **,
|
|
mount options=(rw,make-shared) -> **,
|
|
mount options=(rw,make-rshared) -> **,
|
|
mount options=(rw,make-private) -> **,
|
|
mount options=(rw,make-rprivate) -> **,
|
|
mount options=(rw,make-unbindable) -> **,
|
|
mount options=(rw,make-runbindable) -> **,
|
|
|
|
# allow bind-mounts of anything except /proc, /sys and /dev/.lxc
# (other paths under /dev are allowed)
|
|
mount options=(rw,bind) /[^spd]*{,/**},
|
|
mount options=(rw,bind) /d[^e]*{,/**},
|
|
mount options=(rw,bind) /de[^v]*{,/**},
|
|
mount options=(rw,bind) /dev/.[^l]*{,/**},
|
|
mount options=(rw,bind) /dev/.l[^x]*{,/**},
|
|
mount options=(rw,bind) /dev/.lx[^c]*{,/**},
|
|
mount options=(rw,bind) /dev/.lxc?*{,/**},
|
|
mount options=(rw,bind) /dev/[^.]*{,/**},
|
|
mount options=(rw,bind) /dev?*{,/**},
|
|
mount options=(rw,bind) /p[^r]*{,/**},
|
|
mount options=(rw,bind) /pr[^o]*{,/**},
|
|
mount options=(rw,bind) /pro[^c]*{,/**},
|
|
mount options=(rw,bind) /proc?*{,/**},
|
|
mount options=(rw,bind) /s[^y]*{,/**},
|
|
mount options=(rw,bind) /sy[^s]*{,/**},
|
|
mount options=(rw,bind) /sys?*{,/**},
|
|
|
|
# allow various ro-bind-*re*-mounts
|
|
mount options=(ro,remount,bind),
|
|
mount options=(ro,remount,bind,nosuid),
|
|
mount options=(ro,remount,bind,noexec),
|
|
mount options=(ro,remount,bind,nodev),
|
|
mount options=(ro,remount,bind,nosuid,noexec),
|
|
mount options=(ro,remount,bind,noexec,nodev),
|
|
mount options=(ro,remount,bind,nodev,nosuid),
|
|
mount options=(ro,remount,bind,nosuid,noexec,nodev),
|
|
|
|
# allow moving mounts except for /proc, /sys and /dev/.lxc
# (other paths under /dev are allowed)
|
|
mount options=(rw,move) /[^spd]*{,/**},
|
|
mount options=(rw,move) /d[^e]*{,/**},
|
|
mount options=(rw,move) /de[^v]*{,/**},
|
|
mount options=(rw,move) /dev/.[^l]*{,/**},
|
|
mount options=(rw,move) /dev/.l[^x]*{,/**},
|
|
mount options=(rw,move) /dev/.lx[^c]*{,/**},
|
|
mount options=(rw,move) /dev/.lxc?*{,/**},
|
|
mount options=(rw,move) /dev/[^.]*{,/**},
|
|
mount options=(rw,move) /dev?*{,/**},
|
|
mount options=(rw,move) /p[^r]*{,/**},
|
|
mount options=(rw,move) /pr[^o]*{,/**},
|
|
mount options=(rw,move) /pro[^c]*{,/**},
|
|
mount options=(rw,move) /proc?*{,/**},
|
|
mount options=(rw,move) /s[^y]*{,/**},
|
|
mount options=(rw,move) /sy[^s]*{,/**},
|
|
mount options=(rw,move) /sys?*{,/**},
|
|
|
|
# generated by: lxc-generate-aa-rules.py container-rules.base
|
|
deny /proc/sys/[^kn]*{,/**} wklx,
|
|
deny /proc/sys/k[^e]*{,/**} wklx,
|
|
deny /proc/sys/ke[^r]*{,/**} wklx,
|
|
deny /proc/sys/ker[^n]*{,/**} wklx,
|
|
deny /proc/sys/kern[^e]*{,/**} wklx,
|
|
deny /proc/sys/kerne[^l]*{,/**} wklx,
|
|
deny /proc/sys/kernel/[^smhd]*{,/**} wklx,
|
|
deny /proc/sys/kernel/d[^o]*{,/**} wklx,
|
|
deny /proc/sys/kernel/do[^m]*{,/**} wklx,
|
|
deny /proc/sys/kernel/dom[^a]*{,/**} wklx,
|
|
deny /proc/sys/kernel/doma[^i]*{,/**} wklx,
|
|
deny /proc/sys/kernel/domai[^n]*{,/**} wklx,
|
|
deny /proc/sys/kernel/domain[^n]*{,/**} wklx,
|
|
deny /proc/sys/kernel/domainn[^a]*{,/**} wklx,
|
|
deny /proc/sys/kernel/domainna[^m]*{,/**} wklx,
|
|
deny /proc/sys/kernel/domainnam[^e]*{,/**} wklx,
|
|
deny /proc/sys/kernel/domainname?*{,/**} wklx,
|
|
deny /proc/sys/kernel/h[^o]*{,/**} wklx,
|
|
deny /proc/sys/kernel/ho[^s]*{,/**} wklx,
|
|
deny /proc/sys/kernel/hos[^t]*{,/**} wklx,
|
|
deny /proc/sys/kernel/host[^n]*{,/**} wklx,
|
|
deny /proc/sys/kernel/hostn[^a]*{,/**} wklx,
|
|
deny /proc/sys/kernel/hostna[^m]*{,/**} wklx,
|
|
deny /proc/sys/kernel/hostnam[^e]*{,/**} wklx,
|
|
deny /proc/sys/kernel/hostname?*{,/**} wklx,
|
|
deny /proc/sys/kernel/m[^s]*{,/**} wklx,
|
|
deny /proc/sys/kernel/ms[^g]*{,/**} wklx,
|
|
deny /proc/sys/kernel/msg*/** wklx,
|
|
deny /proc/sys/kernel/s[^he]*{,/**} wklx,
|
|
deny /proc/sys/kernel/se[^m]*{,/**} wklx,
|
|
deny /proc/sys/kernel/sem*/** wklx,
|
|
deny /proc/sys/kernel/sh[^m]*{,/**} wklx,
|
|
deny /proc/sys/kernel/shm*/** wklx,
|
|
deny /proc/sys/kernel?*{,/**} wklx,
|
|
deny /proc/sys/n[^e]*{,/**} wklx,
|
|
deny /proc/sys/ne[^t]*{,/**} wklx,
|
|
deny /proc/sys/net?*{,/**} wklx,
|
|
deny /sys/[^fdc]*{,/**} wklx,
|
|
deny /sys/c[^l]*{,/**} wklx,
|
|
deny /sys/cl[^a]*{,/**} wklx,
|
|
deny /sys/cla[^s]*{,/**} wklx,
|
|
deny /sys/clas[^s]*{,/**} wklx,
|
|
deny /sys/class/[^n]*{,/**} wklx,
|
|
deny /sys/class/n[^e]*{,/**} wklx,
|
|
deny /sys/class/ne[^t]*{,/**} wklx,
|
|
deny /sys/class/net?*{,/**} wklx,
|
|
deny /sys/class?*{,/**} wklx,
|
|
deny /sys/d[^e]*{,/**} wklx,
|
|
deny /sys/de[^v]*{,/**} wklx,
|
|
deny /sys/dev[^i]*{,/**} wklx,
|
|
deny /sys/devi[^c]*{,/**} wklx,
|
|
deny /sys/devic[^e]*{,/**} wklx,
|
|
deny /sys/device[^s]*{,/**} wklx,
|
|
deny /sys/devices/[^v]*{,/**} wklx,
|
|
deny /sys/devices/v[^i]*{,/**} wklx,
|
|
deny /sys/devices/vi[^r]*{,/**} wklx,
|
|
deny /sys/devices/vir[^t]*{,/**} wklx,
|
|
deny /sys/devices/virt[^u]*{,/**} wklx,
|
|
deny /sys/devices/virtu[^a]*{,/**} wklx,
|
|
deny /sys/devices/virtua[^l]*{,/**} wklx,
|
|
deny /sys/devices/virtual/[^n]*{,/**} wklx,
|
|
deny /sys/devices/virtual/n[^e]*{,/**} wklx,
|
|
deny /sys/devices/virtual/ne[^t]*{,/**} wklx,
|
|
deny /sys/devices/virtual/net?*{,/**} wklx,
|
|
deny /sys/devices/virtual?*{,/**} wklx,
|
|
deny /sys/devices?*{,/**} wklx,
|
|
deny /sys/f[^s]*{,/**} wklx,
|
|
deny /sys/fs/[^c]*{,/**} wklx,
|
|
deny /sys/fs/c[^g]*{,/**} wklx,
|
|
deny /sys/fs/cg[^r]*{,/**} wklx,
|
|
deny /sys/fs/cgr[^o]*{,/**} wklx,
|
|
deny /sys/fs/cgro[^u]*{,/**} wklx,
|
|
deny /sys/fs/cgrou[^p]*{,/**} wklx,
|
|
deny /sys/fs/cgroup?*{,/**} wklx,
|
|
deny /sys/fs?*{,/**} wklx,
|