From 66ae2d947ea463c53de1bf2ed5491c9926694f02 Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Mon, 20 Jul 2020 01:37:15 +0100 Subject: [PATCH] jail: re-implement /proc/sys/net read-write in netns hack Hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only which cannot be expressed with OCI spec, but happens to be very useful. Only apply it if '/proc/sys' is not already listed as mount, maskedPath or readonlyPath. Signed-off-by: Daniel Golle --- jail/fs.c | 40 +++++++++++++++++++++++++++++++++++----- jail/fs.h | 2 ++ jail/jail.c | 27 +++++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 7 deletions(-) diff --git a/jail/fs.c b/jail/fs.c index e510c8b..fb0f504 100644 --- a/jail/fs.c +++ b/jail/fs.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,7 @@ struct mount { unsigned long mountflags; const char *optstr; int error; + bool inner; }; struct avl_tree mounts; @@ -76,11 +78,12 @@ int mkdir_p(char *dir, mode_t mask) return ret; } -static int do_mount(const char *root, const char *source, const char *target, const char *filesystemtype, - unsigned long orig_mountflags, const char *optstr, int error) +static int do_mount(const char *root, const char *orig_source, const char *target, const char *filesystemtype, + unsigned long orig_mountflags, const char *optstr, int error, bool inner) { struct stat s; char new[PATH_MAX]; + char *source = (char *)orig_source; int fd; bool is_bind = (orig_mountflags & MS_BIND); bool is_mask = (source == (void *)(-1)); @@ -91,6 +94,11 @@ static int do_mount(const char *root, const char *source, const char *target, co return error; } + if (!is_mask && orig_source && inner) { + if (asprintf(&source, "%s%s", root, orig_source) < 0) + return ENOMEM; + } + snprintf(new, sizeof(new), "%s%s", root, target?target:source); if (is_mask) { @@ -132,6 +140,9 @@ static int do_mount(const char *root, const char *source, const char *target, co if (error) ERROR("failed to 
mount -B %s %s: %m\n", source, new); + if (inner) + free(source); + return error; } mountflags |= MS_REMOUNT; @@ -141,17 +152,23 @@ static int do_mount(const char *root, const char *source, const char *target, co if (error) ERROR("failed to mount %s %s: %m\n", source, new); + if (inner) + free(source); + return error; } DEBUG("mount %s%s %s (%s)\n", (mountflags & MS_BIND)?"-B ":"", source, new, (mountflags & MS_RDONLY)?"ro":"rw"); + if (inner) + free(source); + return 0; } -int add_mount(const char *source, const char *target, const char *filesystemtype, - unsigned long mountflags, const char *optstr, int error) +static int _add_mount(const char *source, const char *target, const char *filesystemtype, + unsigned long mountflags, const char *optstr, int error, bool inner) { assert(target != NULL); @@ -176,6 +193,7 @@ int add_mount(const char *source, const char *target, const char *filesystemtype m->mountflags = mountflags; m->error = error; + m->inner = inner; avl_insert(&mounts, &m->avl); DEBUG("adding mount %s %s bind(%d) ro(%d) err(%d)\n", (m->source == (void*)(-1))?"mask":m->source, m->target, @@ -184,6 +202,18 @@ int add_mount(const char *source, const char *target, const char *filesystemtype return 0; } +int add_mount(const char *source, const char *target, const char *filesystemtype, + unsigned long mountflags, const char *optstr, int error) +{ + return _add_mount(source, target, filesystemtype, mountflags, optstr, error, false); +} + +int add_mount_inner(const char *source, const char *target, const char *filesystemtype, + unsigned long mountflags, const char *optstr, int error) +{ + return _add_mount(source, target, filesystemtype, mountflags, optstr, error, true); +} + int add_mount_bind(const char *path, int readonly, int error) { unsigned long mountflags = MS_BIND; @@ -386,7 +416,7 @@ int mount_all(const char *jailroot) { add_mount_bind(l->path, 1, -1); avl_for_each_element(&mounts, m, avl) - if (do_mount(jailroot, m->source, m->target, 
m->filesystemtype, m->mountflags, m->optstr, m->error)) + if (do_mount(jailroot, m->source, m->target, m->filesystemtype, m->mountflags, m->optstr, m->error, m->inner)) return -1; return 0; diff --git a/jail/fs.h b/jail/fs.h index e7283a3..f94d8b1 100644 --- a/jail/fs.h +++ b/jail/fs.h @@ -19,6 +19,8 @@ int mkdir_p(char *dir, mode_t mask); int add_mount(const char *source, const char *target, const char *filesystemtype, unsigned long mountflags, const char *optstr, int error); +int add_mount_inner(const char *source, const char *target, const char *filesystemtype, + unsigned long mountflags, const char *optstr, int error); int add_mount_bind(const char *path, int readonly, int error); int parseOCImount(struct blob_attr *msg); int add_path_and_deps(const char *path, int readonly, int error, int lib); diff --git a/jail/jail.c b/jail/jail.c index a280944..5a6802f 100644 --- a/jail/jail.c +++ b/jail/jail.c @@ -2216,14 +2216,37 @@ int main(int argc, char **argv) add_mount(NULL, "/dev", "tmpfs", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "size=1M", -1); add_mount(NULL, "/dev/pts", "devpts", MS_NOATIME | MS_NOEXEC | MS_NOSUID, "newinstance,ptmxmode=0666,mode=0620,gid=5", 0); - if (opts.procfs || jsonfile) - add_mount("proc", "/proc", "proc", MS_RDONLY | MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL, -1); + if (opts.procfs || jsonfile) { + add_mount("proc", "/proc", "proc", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID, NULL, -1); + + /* + * hack to make /proc/sys/net read-write while the rest of /proc/sys is read-only + * which cannot be expressed with OCI spec, but happens to be very useful. + * Only apply it if '/proc/sys' is not already listed as mount, maskedPath or + * readonlyPath. + * If not running in a new network namespace, only make /proc/sys read-only. + * If running in a new network namespace, temporarily stash (i.e. mount-bind) + * /proc/sys/net into (totally unrelated, but surely existing) /proc/self/net. 
+ * Then we mount-bind /proc/sys read-only and then mount-move /proc/self/net into + * /proc/sys/net. + * This works because mounts are executed in incrementing strcmp() order and + * /proc/self/net appears there before /proc/sys/net and hence the operation + * succeeds as the bind-mount of /proc/self/net is performed first and then + * move-mount of /proc/sys/net follows because 'e' precedes 'y' in the ASCII + * table (and in the alphabet). + */ + if (!add_mount(NULL, "/proc/sys", NULL, MS_BIND | MS_RDONLY, NULL, -1)) + if (opts.namespace & CLONE_NEWNET) + if (!add_mount_inner("/proc/self/net", "/proc/sys/net", NULL, MS_MOVE, NULL, -1)) + add_mount_inner("/proc/sys/net", "/proc/self/net", NULL, MS_BIND, NULL, -1); + } if (opts.sysfs || jsonfile) add_mount("sysfs", "/sys", "sysfs", MS_NOATIME | MS_NODEV | MS_NOEXEC | MS_NOSUID | MS_RDONLY, NULL, -1); if (jsonfile) add_mount("shm", "/dev/shm", "tmpfs", MS_NOSUID | MS_NOEXEC | MS_NODEV, "mode=1777", -1); + } if (pipe(&pipes[0]) < 0 || pipe(&pipes[2]) < 0) -- 2.30.2