diff --git a/include/pathnames.h b/include/pathnames.h index 81fa405f63c..569bef17f98 100644 --- a/include/pathnames.h +++ b/include/pathnames.h @@ -204,6 +204,7 @@ /* sysctl fs paths */ #define _PATH_PROC_SYS_FS "/proc/sys/fs" #define _PATH_PROC_PIPE_MAX_SIZE _PATH_PROC_SYS_FS "/pipe-max-size" +#define _PATH_PROC_BINFMT_MISC _PATH_PROC_SYS_FS "/binfmt_misc" /* irqtop paths */ #define _PATH_PROC_INTERRUPTS "/proc/interrupts" diff --git a/sys-utils/unshare.1.adoc b/sys-utils/unshare.1.adoc index e6201e28fff..6396c4892dd 100644 --- a/sys-utils/unshare.1.adoc +++ b/sys-utils/unshare.1.adoc @@ -90,6 +90,9 @@ When *unshare* terminates, have _signame_ be sent to the forked child process. C *--mount-proc*[**=**__mountpoint__]:: Just before running the program, mount the proc filesystem at _mountpoint_ (default is _/proc_). This is useful when creating a new PID namespace. It also implies creating a new mount namespace since the _/proc_ mount would otherwise mess up existing programs on the system. The new proc filesystem is explicitly mounted as private (with *MS_PRIVATE*|*MS_REC*). +*--mount-binfmt*[**=**__mountpoint__]:: +Just before running the program, mount the binfmt_misc filesystem at _mountpoint_ (default is /proc/sys/fs/binfmt_misc). This option implies *--user* and *--mount*: a new mount namespace is created since the binfmt_misc mount would otherwise mess up existing programs on the system. The new binfmt_misc filesystem is mounted with *MS_NOSUID*|*MS_NOEXEC*|*MS_NODEV*. + **--map-user=**__uid|name__:: Run the program only after the current effective user ID has been mapped to _uid_. If this option is specified multiple times, the last occurrence takes precedence. This option implies *--user*. @@ -135,6 +138,10 @@ Set the user ID which will be used in the entered namespace. *-G*, *--setgid* _gid_:: Set the group ID which will be used in the entered namespace and drop supplementary groups. 
+*-l*, **--load-interp=**__string__:: +Load a binfmt_misc definition in the namespace (implies *--mount-binfmt*). The __string__ argument is ``:name:type:offset:magic:mask:interpreter:flags``. For more details about new binary type registration see https://www.kernel.org/doc/Documentation/admin-guide/binfmt-misc.rst. +To manage the F flag in ``flags`` with the **--root** parameter, binfmt_misc is mounted twice, once before the chroot to load the interpreter from the caller filesystem and once after to make it available from the chroot userspace. + *--monotonic* _offset_:: Set the offset of *CLOCK_MONOTONIC* which will be used in the entered time namespace. This option requires unsharing a time namespace with *--time*. @@ -253,6 +260,20 @@ up 21 hours, 30 minutes up 9 years, 28 weeks, 1 day, 2 hours, 50 minutes .... +The following example executes a chroot into the directory /chroot/powerpc/jessie and installs the interpreter /bin/qemu-ppc-static to execute the powerpc binaries. + +.... +$ unshare --map-root-user --fork --pid --load-interp=":qemu-ppc:M::\\x7fELF\\x01\\x02\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x02\\x00\\x14:\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xfe\\xff\\xff:/bin/qemu-ppc-static:OCF" --root=/chroot/powerpc/jessie /bin/bash -l +.... 
+ +The ``load-interp`` parameter can be read as follows:: +``qemu-ppc``::: is the name of the new file created below ``/proc/sys/fs/binfmt_misc`` to register the interpreter +``M``::: defines the interpreter for a given type of magic number +``\\x7fELF\\x01\\x02\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x02\\x00\\x14``::: is the magic number to recognize the file to interpret (in this case, the ELF header for PPC32) +``\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xfe\\xff\\xff``::: the mask to apply to the magic number +``/bin/qemu-ppc-static``::: the interpreter to use with the file +``OCF``::: the file is opened by the kernel with the credentials and security tokens of the file itself, and the interpreter is loaded as soon as it is registered. + == AUTHORS mailto:dottedmag@dottedmag.net[Mikhail Gusarov], diff --git a/sys-utils/unshare.c b/sys-utils/unshare.c index 57f3b8744fb..e48e4c9f55a 100644 --- a/sys-utils/unshare.c +++ b/sys-utils/unshare.c @@ -725,6 +725,35 @@ static pid_t map_ids_from_child(int *fd, uid_t mapuser, exit(EXIT_SUCCESS); } +static int is_fixed(const char *interp) +{ + const char *flags; + /* the flags are the last ':'-separated field of the registration string */ + flags = strrchr(interp, ':'); + /* without any ':' there is no flags field; never hand NULL to strchr() */ + return flags && strchr(flags, 'F') != NULL; +} + +static void load_interp(const char *binfmt_mnt, const char *interp) +{ + int dirfd, fd; + + dirfd = open(binfmt_mnt, O_PATH | O_DIRECTORY); + if (dirfd < 0) + err(EXIT_FAILURE, _("cannot open %s"), binfmt_mnt); + + fd = openat(dirfd, "register", O_WRONLY); + if (fd < 0) + err(EXIT_FAILURE, _("cannot open %s/register"), binfmt_mnt); + + if (write_all(fd, interp, strlen(interp))) + err(EXIT_FAILURE, _("write failed %s/register"), binfmt_mnt); + + close(fd); + + close(dirfd); +} + static void __attribute__((__noreturn__)) usage(void) { FILE *out = stdout; @@ -760,6 +789,7 @@ static void __attribute__((__noreturn__)) usage(void) fputs(_(" --kill-child[=] when dying, kill the forked child (implies --fork)\n" " defaults to SIGKILL\n"), out); fputs(_(" 
--mount-proc[=] mount proc filesystem first (implies --mount)\n"), out); + fputs(_(" --mount-binfmt[=] mount binfmt filesystem first (implies --user and --mount)\n"), out); fputs(_(" --propagation slave|shared|private|unchanged\n" " modify mount propagation in mount namespace\n"), out); fputs(_(" --setgroups allow|deny control the setgroups syscall in user namespaces\n"), out); @@ -771,6 +801,7 @@ static void __attribute__((__noreturn__)) usage(void) fputs(_(" -G, --setgid set gid in entered namespace\n"), out); fputs(_(" --monotonic set clock monotonic offset (seconds) in time namespaces\n"), out); fputs(_(" --boottime set clock boottime offset (seconds) in time namespaces\n"), out); + fputs(_(" -l, --load-interp load binfmt definition in the namespace (implies --mount-binfmt)\n"), out); fputs(USAGE_SEPARATOR, out); fprintf(out, USAGE_HELP_OPTIONS(27)); @@ -783,6 +814,7 @@ int main(int argc, char *argv[]) { enum { OPT_MOUNTPROC = CHAR_MAX + 1, + OPT_MOUNTBINFMT, OPT_PROPAGATION, OPT_SETGROUPS, OPT_KILLCHILD, @@ -811,6 +843,7 @@ int main(int argc, char *argv[]) { "fork", no_argument, NULL, 'f' }, { "kill-child", optional_argument, NULL, OPT_KILLCHILD }, { "mount-proc", optional_argument, NULL, OPT_MOUNTPROC }, + { "mount-binfmt", optional_argument, NULL, OPT_MOUNTBINFMT }, { "map-user", required_argument, NULL, OPT_MAPUSER }, { "map-users", required_argument, NULL, OPT_MAPUSERS }, { "map-group", required_argument, NULL, OPT_MAPGROUP }, @@ -827,6 +860,7 @@ int main(int argc, char *argv[]) { "wd", required_argument, NULL, 'w' }, { "monotonic", required_argument, NULL, OPT_MONOTONIC }, { "boottime", required_argument, NULL, OPT_BOOTTIME }, + { "load-interp", required_argument, NULL, 'l' }, { NULL, 0, NULL, 0 } }; @@ -839,9 +873,11 @@ int main(int argc, char *argv[]) struct map_range *groupmap = NULL; int kill_child_signo = 0; /* 0 means --kill-child was not used */ const char *procmnt = NULL; + const char *binfmt_mnt = NULL; const char *newroot = NULL; const char 
*newdir = NULL; pid_t pid_bind = 0, pid_idmap = 0; + const char *newinterp = NULL; pid_t pid = 0; #ifdef UL_HAVE_PIDFD int fd_parent_pid = -1; @@ -864,7 +900,7 @@ int main(int argc, char *argv[]) textdomain(PACKAGE); close_stdout_atexit(); - while ((c = getopt_long(argc, argv, "+fhVmuinpCTUrR:w:S:G:c", longopts, NULL)) != -1) { + while ((c = getopt_long(argc, argv, "+fhVmuinpCTUrR:w:S:G:cl:", longopts, NULL)) != -1) { switch (c) { case 'f': forkit = 1; @@ -913,6 +949,15 @@ int main(int argc, char *argv[]) unshare_flags |= CLONE_NEWNS; procmnt = optarg ? optarg : "/proc"; break; + case OPT_MOUNTBINFMT: + unshare_flags |= CLONE_NEWNS | CLONE_NEWUSER; + binfmt_mnt = optarg; + if (!binfmt_mnt) { + if (!procmnt) + procmnt = "/proc"; + binfmt_mnt = _PATH_PROC_BINFMT_MISC; + } + break; case OPT_MAPUSER: unshare_flags |= CLONE_NEWUSER; mapuser = get_user(optarg, _("failed to parse uid")); @@ -998,6 +1043,15 @@ int main(int argc, char *argv[]) boottime = strtos64_or_err(optarg, _("failed to parse boottime offset")); force_boottime = 1; break; + case 'l': + unshare_flags |= CLONE_NEWNS | CLONE_NEWUSER; + if (!binfmt_mnt) { + if (!procmnt) + procmnt = "/proc"; + binfmt_mnt = _PATH_PROC_BINFMT_MISC; + } + newinterp = optarg; + break; case 'h': usage(); @@ -1152,6 +1206,13 @@ int main(int argc, char *argv[]) if ((unshare_flags & CLONE_NEWNS) && propagation) set_propagation(propagation); + if (newinterp && is_fixed(newinterp) && newroot) { + if (mount("binfmt_misc", _PATH_PROC_BINFMT_MISC, "binfmt_misc", + MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0) + err(EXIT_FAILURE, _("mount %s failed"), _PATH_PROC_BINFMT_MISC); + load_interp(_PATH_PROC_BINFMT_MISC, newinterp); + } + if (newroot) { if (chroot(newroot) != 0) err(EXIT_FAILURE, @@ -1178,6 +1239,14 @@ int main(int argc, char *argv[]) err(EXIT_FAILURE, _("mount %s failed"), procmnt); } + if (binfmt_mnt) { + if (mount("binfmt_misc", binfmt_mnt, "binfmt_misc", + MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL) != 0) + err(EXIT_FAILURE, _("mount %s 
failed"), binfmt_mnt); + } + if (newinterp && !(is_fixed(newinterp) && newroot)) + load_interp(binfmt_mnt, newinterp); + if (force_gid) { if (setgroups(0, NULL) != 0) /* drop supplementary groups */ err(EXIT_FAILURE, _("setgroups failed"));