From 56633d0462874e09c28782bcc83b20805ede383e Mon Sep 17 00:00:00 2001
From: "Franklin \"Snaipe\" Mathieu" <snaipe@arista.com>
Date: Thu, 19 May 2022 01:15:36 +0200
Subject: [PATCH 1/3] seccomp: add syscall emulation for safe syscalls, like
 mknod of /dev/null devices.

---
 arch.h                    |  21 ++
 arch/x86/gen-syscall.bash | 130 +++++++++++
 arch/x86/syscall.c        |  35 +++
 arch/x86/syscall.h        |  16 ++
 arch/x86_64               |   1 +
 capable.h                 |   1 +
 config.h.in               |   7 +-
 enter.c                   |  14 ++
 meson.build               |  17 ++
 outer.c                   |  11 +
 proc.c                    |  31 +++
 proc.h                    |  16 ++
 sec.c                     | 477 ++++++++++++++++++++++++++++++++++++++
 sec.h                     |  15 ++
 test/seccomp.t            |   6 +
 15 files changed, 797 insertions(+), 1 deletion(-)
 create mode 100644 arch.h
 create mode 100755 arch/x86/gen-syscall.bash
 create mode 100644 arch/x86/syscall.c
 create mode 100644 arch/x86/syscall.h
 create mode 120000 arch/x86_64
 create mode 100644 proc.c
 create mode 100644 proc.h
 create mode 100644 sec.c
 create mode 100644 sec.h
 create mode 100755 test/seccomp.t

diff --git a/arch.h b/arch.h
new file mode 100644
index 0000000..c9ccbe4
--- /dev/null
+++ b/arch.h
@@ -0,0 +1,21 @@
+/* Copyright © 2024 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#ifndef ARCH_H_
+# define ARCH_H_
+
+# include "config.h"
+
+# define ARCH_STR_(x) #x
+# define ARCH_STR(x) ARCH_STR_(x)
+
+/* *INDENT-OFF* - formatters try to add spaces here */
+# define ARCH_HEADER_BASE arch/ARCH
+/* *INDENT-ON* */
+
+# include ARCH_STR(ARCH_HEADER_BASE/syscall.h)
+
+#endif /* !ARCH_H_ */
diff --git a/arch/x86/gen-syscall.bash b/arch/x86/gen-syscall.bash
new file mode 100755
index 0000000..3502ed9
--- /dev/null
+++ b/arch/x86/gen-syscall.bash
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+# This script generates the classic BPF program to intercept system calls
+# in x86 userspace.
+
+# From asm/unistd_64.h
+declare -A x86_64_syscalls=(
+	["mknod"]="133"
+	["mknodat"]="259"
+)
+
+# From asm/unistd_32.h
+declare -A i386_syscalls=(
+	["mknod"]="14"
+	["mknodat"]="297"
+)
+
+prelude=(
+	# Check that we're running on x86_64 or i386
+	'BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch)))'
+	'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_I386, $(($i386_offset-2)), 0)'
+	'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0)'
+	'BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)'
+
+	# The x32 ABI (not to be confused with the i386 ABI!) uses the
+	# same system call numbers as x86_64, but set bit 30. Clear it so we share
+	# the same table.
+	'BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr)))'
+	'BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1)'
+	'BPF_STMT(BPF_ALU | BPF_SUB | BPF_K, X32_SYSCALL_BIT)'
+)
+
+syscall_jump=(
+	'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, $nr, 0, 1)'
+	'BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF)'
+)
+
+i386_offset=$((${#prelude[@]} + ${#syscall_jump[@]}*${#x86_64_syscalls[@]} + 1))
+
+# NOTE: indentation is done with tabs. Do not use spaces, do not remove tabs,
+# lest you break all HEREDOCs.
+
+gen_source() {
+	cat <<-EOF
+	/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
+	
+	#include <stddef.h>
+	#include <linux/audit.h>
+	#include <linux/bpf_common.h>
+	#include <linux/filter.h>
+	#include <linux/seccomp.h>
+	
+	/* For the x32 ABI, all system call numbers have bit 30 set */
+	#define X32_SYSCALL_BIT 0x40000000
+	
+	const struct sock_filter syscall_filter[] = {
+	EOF
+
+	for stmt in "${prelude[@]}"; do
+		eval "echo $'\t'\"$stmt\","
+	done
+
+	for syscall in "${!x86_64_syscalls[@]}"; do
+		nr=${x86_64_syscalls[$syscall]}
+		for stmt in "${syscall_jump[@]}"; do
+			eval "echo $'\t'\"$stmt\","
+		done
+	done
+
+	echo $'\t''BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),'
+	echo $'\t''BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),'
+
+	for syscall in "${!i386_syscalls[@]}"; do
+		nr=${i386_syscalls[$syscall]}
+		for stmt in "${syscall_jump[@]}"; do
+			eval "echo $'\t'\"$stmt\","
+		done
+	done
+
+	echo $'\t''BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),'
+
+	cat <<-EOF
+	};
+	
+	const size_t syscall_filter_length = sizeof (syscall_filter) / sizeof (struct sock_filter);
+
+	/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
+	EOF
+}
+
+gen_header() {
+	cat <<-EOF
+	/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
+
+	extern const struct sock_filter syscall_filter[];
+	extern const size_t syscall_filter_length;
+
+	EOF
+
+	for syscall in "${!x86_64_syscalls[@]}"; do
+		echo "#define BST_NR_${syscall} ${x86_64_syscalls[$syscall]}"
+	done
+
+	for syscall in "${!i386_syscalls[@]}"; do
+		echo "#define BST_NR_${syscall}_32 ${i386_syscalls[$syscall]}"
+	done
+
+	max=0
+	for syscall in "${!x86_64_syscalls[@]}"; do
+		(( ${x86_64_syscalls[$syscall]} > max )) && max=${x86_64_syscalls[$syscall]}
+	done
+
+	max32=0
+	for syscall in "${!i386_syscalls[@]}"; do
+		(( ${i386_syscalls[$syscall]} > max32 )) && max32=${i386_syscalls[$syscall]}
+	done
+
+	cat <<-EOF
+
+	#define BST_SECCOMP_32 1
+
+	#define BST_NR_MAX $max
+	#define BST_NR_MAX32 $max32
+
+	/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
+	EOF
+}
+
+gen_source > arch/x86/syscall.c
+gen_header > arch/x86/syscall.h
diff --git a/arch/x86/syscall.c b/arch/x86/syscall.c
new file mode 100644
index 0000000..fc426c9
--- /dev/null
+++ b/arch/x86/syscall.c
@@ -0,0 +1,35 @@
+/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
+
+#include <stddef.h>
+#include <linux/audit.h>
+#include <linux/bpf_common.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+
+/* For the x32 ABI, all system call numbers have bit 30 set */
+#define X32_SYSCALL_BIT 0x40000000
+
+const struct sock_filter syscall_filter[] = {
+	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_I386, 10, 0),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
+	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
+	BPF_JUMP(BPF_JMP | BPF_JGE | BPF_K, X32_SYSCALL_BIT, 0, 1),
+	BPF_STMT(BPF_ALU | BPF_SUB | BPF_K, X32_SYSCALL_BIT),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 133, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 259, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 14, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 297, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+};
+
+const size_t syscall_filter_length = sizeof (syscall_filter) / sizeof (struct sock_filter);
+
+/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
diff --git a/arch/x86/syscall.h b/arch/x86/syscall.h
new file mode 100644
index 0000000..8ed4914
--- /dev/null
+++ b/arch/x86/syscall.h
@@ -0,0 +1,16 @@
+/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
+
+extern const struct sock_filter syscall_filter[];
+extern const size_t syscall_filter_length;
+
+#define BST_NR_mknod 133
+#define BST_NR_mknodat 259
+#define BST_NR_mknod_32 14
+#define BST_NR_mknodat_32 297
+
+#define BST_SECCOMP_32 1
+
+#define BST_NR_MAX 259
+#define BST_NR_MAX32 297
+
+/* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
diff --git a/arch/x86_64 b/arch/x86_64
new file mode 120000
index 0000000..f4bad79
--- /dev/null
+++ b/arch/x86_64
@@ -0,0 +1 @@
+x86
\ No newline at end of file
diff --git a/capable.h b/capable.h
index 92e62e5..7a3d598 100644
--- a/capable.h
+++ b/capable.h
@@ -20,6 +20,7 @@
 # define BST_CAP_SETUID         ((uint64_t) 1 << CAP_SETUID)
 # define BST_CAP_SETGID         ((uint64_t) 1 << CAP_SETGID)
 # define BST_CAP_SYS_CHROOT     ((uint64_t) 1 << CAP_SYS_CHROOT)
+# define BST_CAP_MKNOD          ((uint64_t) 1 << CAP_MKNOD)
 
 extern int deny_new_capabilities;
 
diff --git a/config.h.in b/config.h.in
index 14292b9..c596351 100644
--- a/config.h.in
+++ b/config.h.in
@@ -12,8 +12,13 @@
 # define LIBEXECDIR "@libexecdir@"
 # define VERSION "@version@"
 
+#mesondefine ARCH
+#mesondefine ARCH_X86
+#mesondefine ARCH_X86_64
+
+#mesondefine HAVE_SECCOMP_UNOTIFY
+#mesondefine HAVE_SYSTEMD
 #mesondefine HAVE_SYS_mount_setattr
 #mesondefine HAVE_close_range
-#mesondefine HAVE_SYSTEMD
 
 #endif /* !CONFIG_H_ */
diff --git a/enter.c b/enter.c
index 2c8357e..64e4106 100644
--- a/enter.c
+++ b/enter.c
@@ -27,8 +27,10 @@
 #include "bst_limits.h"
 #include "capable.h"
 #include "compat.h"
+#include "config.h"
 #include "enter.h"
 #include "errutil.h"
+#include "fd.h"
 #include "mount.h"
 #include "net.h"
 #include "ns.h"
@@ -40,6 +42,10 @@
 #include "util.h"
 #include "fd.h"
 
+#ifdef HAVE_SECCOMP_UNOTIFY
+# include "sec.h"
+#endif
+
 static inline size_t append_argv(char **argv, size_t argc, char *arg)
 {
 	if (argc >= ARG_MAX) {
@@ -456,6 +462,14 @@ int enter(struct entry_settings *opts)
 	}
 	ns_enter_postfork(namespaces, ns_len);
 
+#ifdef HAVE_SECCOMP_UNOTIFY
+		int seccomp_fd = sec_seccomp_install_filter();
+		if (seccomp_fd != -1) {
+			send_fd(outer_helper.fd, seccomp_fd);
+			close(seccomp_fd);
+		}
+#endif
+
 	outer_helper_close(&outer_helper);
 
 	int rtnl = init_rtnetlink_socket();
diff --git a/meson.build b/meson.build
index ae645e5..51984f8 100644
--- a/meson.build
+++ b/meson.build
@@ -51,18 +51,26 @@ if get_option('optimization') != '0'
 		language: ['c'])
 endif
 
+arch = host_machine.cpu_family()
+
 config = configuration_data()
 config.set('package', meson.project_name())
 config.set('bindir', bindir)
 config.set('libexecdir', libexecdir)
 config.set('version', version)
 
+config.set('ARCH', arch)
+config.set('ARCH_@0@'.format(arch.to_upper()), 1)
+
 config.set('HAVE_SYS_mount_setattr', cc.has_header_symbol('syscall.h', 'SYS_mount_setattr'))
 config.set('HAVE_close_range', cc.has_function('close_range'))
 
 libdbus = dependency('dbus-1', required: false)
 config.set('HAVE_SYSTEMD', libdbus.found())
 
+has_seccomp_unotify = cc.has_header_symbol('linux/seccomp.h', 'SECCOMP_FILTER_FLAG_NEW_LISTENER')
+config.set('HAVE_SECCOMP_UNOTIFY', has_seccomp_unotify)
+
 configure_file(input: 'config.h.in', output: 'config.h', configuration: config)
 
 bst_init_sources = [
@@ -113,6 +121,14 @@ if libdbus.found()
 	bst_sources += ['cgroup_systemd.c']
 endif
 
+if has_seccomp_unotify
+	bst_sources += [
+		'arch/@0@/syscall.c'.format(arch),
+		'proc.c',
+		'sec.c',
+	]
+endif
+
 executable('bst', bst_sources, install: true, dependencies: [libdbus])
 
 if not get_option('no-setcap-or-suid')
@@ -125,6 +141,7 @@ if not get_option('no-setcap-or-suid')
 			'cap_sys_admin',
 			'cap_sys_chroot',
 			'cap_sys_ptrace',
+			'cap_mknod',
 		],
 		'bst-unpersist': [
 			'cap_sys_admin',
diff --git a/outer.c b/outer.c
index 5dc402a..860f95e 100644
--- a/outer.c
+++ b/outer.c
@@ -23,6 +23,7 @@
 #include "capable.h"
 #include "cgroup.h"
 #include "compat.h"
+#include "config.h"
 #include "enter.h"
 #include "fd.h"
 #include "outer.h"
@@ -31,6 +32,10 @@
 #include "userns.h"
 #include "util.h"
 
+#ifdef HAVE_SECCOMP_UNOTIFY
+# include "sec.h"
+#endif
+
 enum {
 	/* This should be enough for defining our mappings. If we assign
 	   340 mappings, and since each line would contain at most
@@ -404,7 +409,13 @@ void outer_helper_spawn(struct outer_helper *helper)
 	ssize_t count = write(fd, &ok, sizeof (ok));
 	assert((ssize_t)(sizeof (ok)) == count);
 
+#ifdef HAVE_SECCOMP_UNOTIFY
+	int seccomp_fd = recv_fd(fd);
+	sec_seccomp_supervisor(seccomp_fd);
+	__builtin_unreachable();
+#else
 	_exit(0);
+#endif
 }
 
 void outer_helper_sendpid(const struct outer_helper *helper, pid_t pid)
diff --git a/proc.c b/proc.c
new file mode 100644
index 0000000..af7f922
--- /dev/null
+++ b/proc.c
@@ -0,0 +1,46 @@
+/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "proc.h"
+
+/* Read the "status" file found under procfd (a descriptor on /proc/<pid>)
+   and fill *out with the fields we care about. Fields whose line is missing
+   from the file are left zeroed.
+   Returns 0 on success, or -1 with errno set if the file cannot be opened. */
+int proc_read_status(int procfd, struct proc_status *out)
+{
+	memset(out, 0, sizeof (*out));
+
+	int statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC);
+	if (statusfd == -1) {
+		return -1;
+	}
+
+	FILE *f = fdopen(statusfd, "r");
+	if (f == NULL) {
+		/* fdopen leaves the descriptor open on failure; close it ourselves
+		   and keep fdopen's errno for the caller. */
+		int saved_errno = errno;
+		close(statusfd);
+		errno = saved_errno;
+		return -1;
+	}
+
+	char line[4096];
+	while (fgets(line, sizeof (line), f)) {
+		/* Lines that do not start with "Umask:" simply fail to match. */
+		sscanf(line, "Umask:\t%o\n", &out->umask);
+	}
+
+	fclose(f);
+	return 0;
+}
diff --git a/proc.h b/proc.h
new file mode 100644
index 0000000..c204e6b
--- /dev/null
+++ b/proc.h
@@ -0,0 +1,20 @@
+/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#ifndef PROC_H_
+# define PROC_H_
+
+# include <sys/types.h>
+
+struct proc_status {
+	mode_t umask; /* value of the "Umask:" line; 0 when the line is absent */
+};
+
+/* Fill *out from the "status" file found under the directory procfd refers
+   to. Returns 0 on success, -1 on failure. */
+int proc_read_status(int procfd, struct proc_status *out);
+
+#endif /* !PROC_H_ */
diff --git a/sec.c b/sec.c
new file mode 100644
index 0000000..2d394ca
--- /dev/null
+++ b/sec.c
@@ -0,0 +1,477 @@
+/* Copyright © 2024 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <linux/audit.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <sched.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/epoll.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <syscall.h>
+#include <unistd.h>
+
+#include "arch.h"
+#include "capable.h"
+#include "proc.h"
+#include "sec.h"
+#include "util.h"
+
+typedef int syscall_handler_func(int, int, struct seccomp_notif *);
+
+enum {
+	SYSCALL_HANDLED,  /* handler dealt with the syscall; the response is sent as filled in */
+	SYSCALL_CONTINUE, /* dispatcher answers with SECCOMP_USER_NOTIF_FLAG_CONTINUE */
+};
+
+/* Return a cached descriptor on our own mount namespace; exits on failure. */
+static int self_mnt_nsfd(void)
+{
+	static int cached = -1;
+	if (cached != -1) {
+		return cached;
+	}
+	cached = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
+	if (cached == -1) {
+		err(1, "open /proc/self/ns/mnt");
+	}
+	return cached;
+}
+
+static int check_seccomp_cookie(int seccomp_fd, __u64 *id)
+{
+	return ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_ID_VALID, id); /* ioctl result returned as-is; callers treat -1 as "no longer valid" */
+}
+
+/* Open an O_PATH descriptor on what `dirfd` designates in the process behind
+   procfd (its cwd for AT_FDCWD). Returns the descriptor, or -EINVAL. */
+static int resolve_dirfd(int procfd, int dirfd)
+{
+	char fdpath[PATH_MAX+1];
+	const char *path = "cwd";
+	if (dirfd != AT_FDCWD) {
+		int len = snprintf(fdpath, sizeof (fdpath), "fd/%d", dirfd);
+		if (len < 0 || (size_t) len >= sizeof (fdpath)) {
+			warnx("fd/%d takes more than PATH_MAX bytes.", dirfd);
+			return -EINVAL;
+		}
+		path = fdpath;
+	}
+
+	make_capable(BST_CAP_SYS_PTRACE | BST_CAP_DAC_OVERRIDE);
+	int realdirfd = openat(procfd, path, O_PATH | O_CLOEXEC);
+	reset_capabilities();
+	if (realdirfd == -1) {
+		warn("open");
+		return -EINVAL;
+	}
+	return realdirfd;
+}
+
+struct arg_buf {
+	uintptr_t addr; /* address in the target process; 0 ends an array of arg_buf */
+	size_t size;    /* number of bytes to copy */
+	void *buf;      /* local memory copied to (input) or from (output) */
+};
+
+typedef int runproc_func(int procfd, void *cookie);
+
+/* Run fn(procfd, cookie) in the mount namespace of the process behind procfd,
+   then return to our own. `in` and `out` are optional arrays ending with an
+   addr == 0 entry: `in` buffers are read from the target's memory before fn
+   runs (size becomes the byte count actually read); `out` buffers are written
+   back after fn succeeds. Returns fn's result or a negative errno value. */
+static int run_in_process_context(int seccomp_fd, int procfd,
+		struct seccomp_notif *req, struct arg_buf *in, struct arg_buf *out,
+		void *cookie, runproc_func *fn)
+{
+	int rc = 0;
+
+	make_capable(BST_CAP_SYS_PTRACE | BST_CAP_DAC_OVERRIDE);
+	int selfmnt = self_mnt_nsfd();
+	int memfd = openat(procfd, "mem", O_RDWR | O_CLOEXEC);
+	int mntns = openat(procfd, "ns/mnt", O_RDONLY | O_CLOEXEC);
+	reset_capabilities();
+
+	if (memfd == -1) {
+		warn("open /proc/<pid>/mem");
+		rc = -EINVAL;
+		goto error_close;
+	}
+	if (mntns == -1) {
+		warn("open /proc/<pid>/ns/mnt");
+		rc = -EINVAL;
+		goto error_close;
+	}
+
+	for (struct arg_buf *a = in; a && a->addr; a++) {
+		size_t total = 0;
+		while (total < a->size) {
+			/* Resume where the previous short read stopped. */
+			ssize_t nread = pread(memfd, (char *) a->buf + total,
+					a->size - total, a->addr + total);
+			if (nread == -1 && total == 0) {
+				warn("pread %lx:%zu", a->addr, a->size);
+				rc = -EFAULT;
+				goto error_close;
+			}
+			if (nread <= 0) {
+				/* EOF or end of readable memory: keep what we have. */
+				break;
+			}
+			total += nread;
+		}
+		a->size = total;
+	}
+
+	/* Check again that the process is alive and blocked on the syscall, in
+	   case it got interrupted by a signal handler and the program state
+	   changed before we read the pathname or other information from proc. */
+	if (check_seccomp_cookie(seccomp_fd, &req->id) == -1) {
+		rc = -errno;
+		goto error_close;
+	}
+
+	make_capable(BST_CAP_SYS_ADMIN | BST_CAP_SYS_CHROOT);
+	int rc2 = setns(mntns, CLONE_NEWNS);
+	reset_capabilities();
+	if (rc2 == -1) {
+		warn("setns");
+		rc = -EOPNOTSUPP;
+		goto error;
+	}
+
+	/* Callbacks report failure as a negative errno value, not just -1. */
+	if ((rc = fn(procfd, cookie)) < 0) {
+		goto error;
+	}
+
+	for (struct arg_buf *a = out; a && a->addr; a++) {
+		const char *src = a->buf;
+		while (a->size > 0) {
+			ssize_t nwrite = pwrite(memfd, src, a->size, a->addr);
+			if (nwrite <= 0) {
+				warn("pwrite %lx:%zu", a->addr, a->size);
+				rc = -EFAULT;
+				goto error;
+			}
+			src += nwrite;
+			a->size -= nwrite;
+			a->addr += nwrite;
+		}
+	}
+
+error:
+	make_capable(BST_CAP_SYS_ADMIN | BST_CAP_SYS_CHROOT);
+	rc2 = setns(selfmnt, CLONE_NEWNS);
+	reset_capabilities();
+	if (rc2 == -1) {
+		err(1, "setns");
+	}
+error_close:
+	close(mntns);
+	close(memfd);
+	return rc;
+}
+
+struct mknodat_args {
+	int dirfd;               /* directory descriptor handed to mknodat() */
+	mode_t mode;             /* mode argument handed to mknodat() */
+	dev_t dev;               /* device number handed to mknodat() */
+	char pathname[PATH_MAX]; /* path handed to mknodat(); copied from the target's memory */
+};
+
+/* runproc_func for mknodat: create the node described by `cookie` (a struct
+   mknodat_args) while temporarily using the umask of the process behind
+   procfd. Returns 0 on success or a negative errno value. */
+static int sec__mknodat_callback(int procfd, void *cookie)
+{
+	const struct mknodat_args *req = cookie;
+	struct proc_status target;
+
+	if (proc_read_status(procfd, &target) == -1) {
+		warn("proc_read_status /proc/<pid>/status");
+		return -EINVAL;
+	}
+
+	/* mknodat applies the calling process' umask, so switch to the
+	   target's for the duration of the call. */
+	const mode_t saved_umask = umask(target.umask);
+
+	make_capable(BST_CAP_MKNOD);
+	int failed = mknodat(req->dirfd, req->pathname, req->mode, req->dev);
+	int result = failed == -1 ? -errno : 0;
+	reset_capabilities();
+
+	if (saved_umask != (mode_t) -1) {
+		umask(saved_umask);
+	}
+	return result;
+}
+
+/* Emulate mknodat() for the notifying process when it requests one of the
+   devices listed as safe below. pathnameaddr is the pathname's address in
+   that process. Returns SYSCALL_CONTINUE to leave the call to the kernel, a
+   negative errno value on failure, or run_in_process_context()'s result. */
+static int sec__mknodat_impl(int seccomp_fd, int procfd,
+		struct seccomp_notif *req, int dirfd, uintptr_t pathnameaddr,
+		mode_t mode, dev_t dev)
+{
+	const mode_t type = mode & S_IFMT;
+	if (type != S_IFCHR && type != S_IFBLK) {
+		/* Fallthrough for non-privileged operations -- the caller already
+		   has the rights to do this themselves. */
+		return SYSCALL_CONTINUE;
+	}
+
+	/* Is this one of the safe devices? */
+	struct devtype {
+		mode_t type;
+		dev_t  dev;
+	};
+
+	const struct devtype safe_devices[] = {
+		{ .type = S_IFCHR, .dev = makedev(0, 0) }, // whiteout device
+		{ .type = S_IFCHR, .dev = makedev(1, 3) }, // null device
+		{ .type = S_IFCHR, .dev = makedev(1, 5) }, // zero device
+		{ .type = S_IFCHR, .dev = makedev(1, 7) }, // full device
+		{ .type = S_IFCHR, .dev = makedev(1, 8) }, // random device
+		{ .type = S_IFCHR, .dev = makedev(1, 9) }, // urandom device
+		{ .type = S_IFCHR, .dev = makedev(5, 0) }, // tty device
+	};
+
+	for (size_t i = 0; i < lengthof(safe_devices); i++) {
+		if (type == safe_devices[i].type && dev == safe_devices[i].dev) {
+			goto safe;
+		}
+	}
+	return SYSCALL_CONTINUE;
+
+safe: {}
+	/* The device is safe to create -- perform shenanigans */
+
+	int realdirfd = resolve_dirfd(procfd, dirfd);
+	if (realdirfd < 0) {
+		return realdirfd;
+	}
+
+	struct mknodat_args args = {
+		.dirfd = realdirfd,
+		.dev = dev,
+		.mode = mode,
+	};
+
+	struct arg_buf in[] = {
+		{
+			.addr = pathnameaddr,
+			.buf  = &args.pathname[0],
+			.size = PATH_MAX-1,
+		},
+		{
+			.addr = 0,
+		},
+	};
+
+	int rc = run_in_process_context(seccomp_fd, procfd, req, in, NULL, &args, sec__mknodat_callback);
+	close(realdirfd);
+	return rc;
+}
+
+static int sec__mknod(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{
+	/* Syscall arguments, in order: pathname, mode, dev; resolved from cwd. */
+	const __u64 *argv = req->data.args;
+
+	return sec__mknodat_impl(seccomp_fd, procfd, req, AT_FDCWD,
+			(uintptr_t) argv[0], (mode_t) argv[1], (dev_t) argv[2]);
+}
+
+static int sec__mknodat(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{
+	/* Syscall arguments, in order: dirfd, pathname, mode, dev. */
+	const __u64 *argv = req->data.args;
+	int dirfd = (int) argv[0];
+
+	return sec__mknodat_impl(seccomp_fd, procfd, req, dirfd,
+			(uintptr_t) argv[1], (mode_t) argv[2], (dev_t) argv[3]);
+}
+
+static int seccomp(unsigned int op, unsigned int flags, void *args)
+{
+	return syscall(__NR_seccomp, op, flags, args); /* invoke seccomp(2) through syscall(2) */
+}
+
+/* Install syscall_filter; returns its listener fd, or a placeholder fd when nested. */
+int sec_seccomp_install_filter(void)
+{
+	struct sock_fprog prog = {
+		.len    = syscall_filter_length,
+		.filter = (struct sock_filter *)syscall_filter,
+	};
+
+	int listener = seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_NEW_LISTENER, &prog);
+	if (listener != -1) {
+		return listener;
+	}
+	if (errno != EBUSY) {
+		err(1, "seccomp SECCOMP_SET_MODE_FILTER");
+	}
+	/* Likely bst in bst: give the supervisor a useless descriptor instead. */
+	return epoll_create1(EPOLL_CLOEXEC);
+}
+
+/* Answer one notification: pick the handler registered for req's syscall
+   number (per ABI) and run it against the caller's /proc/<pid>. Syscalls with
+   no handler, or whose handler returns SYSCALL_CONTINUE, resume in-kernel. */
+static void sec_seccomp_dispatch_syscall(int seccomp_fd,
+		struct seccomp_notif *req, struct seccomp_notif_resp *resp)
+{
+	static syscall_handler_func *const syscall_table[BST_NR_MAX+1] = {
+#ifdef BST_NR_mknod
+		[BST_NR_mknod]   = sec__mknod,
+#endif
+		[BST_NR_mknodat] = sec__mknodat,
+	};
+#ifdef BST_SECCOMP_32
+	static syscall_handler_func *const syscall_table_32[BST_NR_MAX32+1] = {
+#ifdef BST_NR_mknod_32
+		[BST_NR_mknod_32]   = sec__mknod,
+#endif
+		[BST_NR_mknodat_32] = sec__mknodat,
+	};
+#endif
+
+	resp->id = req->id;
+	syscall_handler_func *const *table = syscall_table;
+	size_t nr_syscall = lengthof(syscall_table);
+#ifdef ARCH_X86_64
+#ifdef BST_SECCOMP_32
+	if (req->data.arch == AUDIT_ARCH_I386) {
+		table = syscall_table_32;
+		nr_syscall = lengthof(syscall_table_32);
+	}
+#endif
+	if (req->data.arch == AUDIT_ARCH_X86_64) {
+		/* x32 system calls are the same as x86_64, except they have bit 30
+		 * set; we're not making any difference here, so reset it */
+		req->data.nr &= ~0x40000000;
+	}
+#endif
+
+	if (req->data.nr <= 0 || (size_t) req->data.nr >= nr_syscall) {
+		goto fallthrough;
+	}
+	syscall_handler_func *fn = table[(size_t) req->data.nr];
+	if (!fn) {
+		goto fallthrough;
+	}
+
+	char procpath[PATH_MAX+1];
+	int len = snprintf(procpath, sizeof (procpath), "/proc/%d", req->pid);
+	if (len < 0 || (size_t) len >= sizeof (procpath)) {
+		errx(1, "/proc/%d takes more than PATH_MAX bytes.", req->pid);
+	}
+
+	int procfd = open(procpath, O_PATH | O_DIRECTORY | O_CLOEXEC);
+	if (procfd == -1) {
+		if (errno == ENOENT) {
+			goto fallthrough;
+		}
+		err(1, "open");
+	}
+
+	int rc = fn(seccomp_fd, procfd, req);
+	close(procfd);
+	if (rc < 0) {
+		resp->error = rc;
+	} else if (rc == SYSCALL_CONTINUE) {
+		goto fallthrough;
+	}
+
+send:
+	if (ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_SEND, resp) == -1) {
+		// ENOENT is normal -- this means the syscall got interrupted by a
+		// signal.
+		if (errno != ENOENT) {
+			warn("ioctl SECCOMP_IOCTL_NOTIF_SEND");
+		}
+	}
+	return;
+
+fallthrough:
+	resp->flags = SECCOMP_USER_NOTIF_FLAG_CONTINUE;
+	goto send;
+}
+
+/* Receive seccomp user notifications on seccomp_fd and dispatch each of
+   them, in an endless loop; never returns. */
+noreturn void sec_seccomp_supervisor(int seccomp_fd)
+{
+	/* Run the seccomp supervisor: a privileged helper that performs safe
+	   syscalls on behalf of the unprivileged child in a user namespace.
+
+	   Use-cases include:
+	   * Allowing mknod on devices deemed "safe", like /dev/null, or the
+	     overlayfs whiteout file.
+	   * Allowing devtmpfs mounts with our custom bst_devtmpfs logic.
+
+	   For now this is a blocking loop -- should other long-running agents
+	   be needed later, consider an epoll loop or forking separate
+	   processes. */
+
+	struct seccomp_notif_sizes sizes;
+	if (seccomp(SECCOMP_GET_NOTIF_SIZES, 0, &sizes) == -1) {
+		err(1, "seccomp SECCOMP_GET_NOTIF_SIZES");
+	}
+
+	struct seccomp_notif *req = malloc(sizes.seccomp_notif);
+	if (req == NULL) {
+		err(1, "malloc");
+	}
+
+	/* The kernel may expect a response smaller than the 'struct
+	   seccomp_notif_resp' we were compiled against (newer headers, older
+	   kernel). Allocate the larger of the two sizes so that filling in
+	   every field we know about never touches invalid memory. */
+	size_t resp_size = sizeof (struct seccomp_notif_resp);
+	if (sizes.seccomp_notif_resp > resp_size) {
+		resp_size = sizes.seccomp_notif_resp;
+	}
+
+	struct seccomp_notif_resp *resp = malloc(resp_size);
+	if (resp == NULL) {
+		err(1, "malloc");
+	}
+
+	for (;;) {
+		memset(req,  0, sizes.seccomp_notif);
+		memset(resp, 0, resp_size);
+
+		if (ioctl(seccomp_fd, SECCOMP_IOCTL_NOTIF_RECV, req) != -1) {
+			sec_seccomp_dispatch_syscall(seccomp_fd, req, resp);
+			continue;
+		}
+
+		if (errno == EINTR) {
+			continue;
+		}
+		if (errno == ENOTTY) {
+			/* seccomp running in seccomp, which is not supported/needed */
+			_exit(0);
+		}
+		err(1, "ioctl SECCOMP_IOCTL_NOTIF_RECV");
+	}
+}
+
diff --git a/sec.h b/sec.h
new file mode 100644
index 0000000..1da2ce3
--- /dev/null
+++ b/sec.h
@@ -0,0 +1,15 @@
+/* Copyright © 2022 Arista Networks, Inc. All rights reserved.
+ *
+ * Use of this source code is governed by the MIT license that can be found
+ * in the LICENSE file.
+ */
+
+#ifndef SEC_H_
+# define SEC_H_
+
+# include <stdnoreturn.h>
+
+int sec_seccomp_install_filter(void);
+noreturn void sec_seccomp_supervisor(int);
+
+#endif /* !SEC_H_ */
diff --git a/test/seccomp.t b/test/seccomp.t
new file mode 100755
index 0000000..ddea559
--- /dev/null
+++ b/test/seccomp.t
@@ -0,0 +1,6 @@
+#!/usr/bin/env cram.sh
+
+mknod should work for safe devices unprivileged
+
+	$ bst mknod null c 1 3
+	> rm -f null

From 01f207fe2eb9cd3ca344e0c0b92a4adb762a3a59 Mon Sep 17 00:00:00 2001
From: "Franklin \"Snaipe\" Mathieu" <snaipe@arista.com>
Date: Mon, 21 Oct 2024 16:11:17 +0200
Subject: [PATCH 2/3] seccomp: add support for aarch64

---
 arch/aarch64/gen-syscall.bash | 89 +++++++++++++++++++++++++++++++++++
 arch/aarch64/syscall.c        | 21 +++++++++
 arch/aarch64/syscall.h        | 10 ++++
 3 files changed, 120 insertions(+)
 create mode 100755 arch/aarch64/gen-syscall.bash
 create mode 100644 arch/aarch64/syscall.c
 create mode 100644 arch/aarch64/syscall.h

diff --git a/arch/aarch64/gen-syscall.bash b/arch/aarch64/gen-syscall.bash
new file mode 100755
index 0000000..d99acd3
--- /dev/null
+++ b/arch/aarch64/gen-syscall.bash
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+# This script generates the classic BPF program to intercept system calls
+# in AArch64 userspace.
+
+# From asm/unistd.h -- or you can use https://arm64.syscall.sh/ for new ones
+declare -A syscalls=(
+	["mknodat"]="33"
+)
+
+prelude=(
+	# Check that we're running on AArch64
+	'BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch)))'
+	'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_AARCH64, 1, 0)'
+	'BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS)'
+
+	# Load syscall number
+	'BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr)))'
+)
+
+syscall_jump=(
+	'BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, $nr, 0, 1)'
+	'BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF)'
+)
+
+# NOTE: indentation is done with tabs. Do not use spaces, do not remove tabs,
+# lest you break all HEREDOCs.
+
+gen_source() {
+	cat <<-EOF
+	/* THIS FILE WAS GENERATED BY arch/aarch64/gen-syscall.bash -- DO NOT EDIT */
+	
+	#include <stddef.h>
+	#include <linux/audit.h>
+	#include <linux/bpf_common.h>
+	#include <linux/filter.h>
+	#include <linux/seccomp.h>
+	
+	const struct sock_filter syscall_filter[] = {
+	EOF
+
+	for stmt in "${prelude[@]}"; do
+		eval "echo $'\t'\"$stmt\","
+	done
+
+	for syscall in "${!syscalls[@]}"; do
+		nr=${syscalls[$syscall]}
+		for stmt in "${syscall_jump[@]}"; do
+			eval "echo $'\t'\"$stmt\","
+		done
+	done
+
+	echo $'\t''BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),'
+
+	cat <<-EOF
+	};
+	
+	const size_t syscall_filter_length = sizeof (syscall_filter) / sizeof (struct sock_filter);
+	
+	/* THIS FILE WAS GENERATED BY arch/aarch64/gen-syscall.bash -- DO NOT EDIT */
+	EOF
+}
+
+gen_header() {
+	cat <<-EOF
+	/* THIS FILE WAS GENERATED BY arch/aarch64/gen-syscall.bash -- DO NOT EDIT */
+
+	extern const struct sock_filter syscall_filter[];
+	extern const size_t syscall_filter_length;
+
+	EOF
+
+	max=0
+	for syscall in "${!syscalls[@]}"; do
+		echo "#define BST_NR_${syscall} ${syscalls[$syscall]}"
+		(( ${syscalls[$syscall]} > max )) && max=${syscalls[$syscall]}
+	done
+
+	cat <<-EOF
+
+	#define BST_NR_MAX $max
+
+	/* THIS FILE WAS GENERATED BY arch/aarch64/gen-syscall.bash -- DO NOT EDIT */
+	EOF
+}
+
+gen_source > arch/aarch64/syscall.c
+gen_header > arch/aarch64/syscall.h
+
diff --git a/arch/aarch64/syscall.c b/arch/aarch64/syscall.c
new file mode 100644
index 0000000..5c5a95a
--- /dev/null
+++ b/arch/aarch64/syscall.c
@@ -0,0 +1,21 @@
+/* THIS FILE WAS GENERATED BY arch/aarch64/gen-syscall.bash -- DO NOT EDIT */
+
+#include <stddef.h>
+#include <linux/audit.h>
+#include <linux/bpf_common.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+
+const struct sock_filter syscall_filter[] = {
+	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, arch))),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_AARCH64, 1, 0),
+	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
+	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 33, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
+};
+
+const size_t syscall_filter_length = sizeof (syscall_filter) / sizeof (struct sock_filter);
+
+/* THIS FILE WAS GENERATED BY arch/aarch64/gen-syscall.bash -- DO NOT EDIT */
diff --git a/arch/aarch64/syscall.h b/arch/aarch64/syscall.h
new file mode 100644
index 0000000..0d4edf0
--- /dev/null
+++ b/arch/aarch64/syscall.h
@@ -0,0 +1,10 @@
+/* THIS FILE WAS GENERATED BY arch/aarch64/gen-syscall.bash -- DO NOT EDIT */
+
+extern const struct sock_filter syscall_filter[];
+extern const size_t syscall_filter_length;
+
+#define BST_NR_mknodat 33
+
+#define BST_NR_MAX 33
+
+/* THIS FILE WAS GENERATED BY arch/aarch64/gen-syscall.bash -- DO NOT EDIT */

From 93bc506458630b82691970ff48cc2d7d78adff51 Mon Sep 17 00:00:00 2001
From: "Franklin \"Snaipe\" Mathieu" <snaipe@arista.com>
Date: Tue, 12 Nov 2024 11:26:41 +0100
Subject: [PATCH 3/3] stat,seccomp: fix 32-bit overflow on stat quantities

On most i686 distributions, glibc implements stat() for programs compiled
without -D_FILE_OFFSET_BITS=64 by calling the corresponding stat64 system
call, and if any of the 64-bit quantities in the statbuf are larger than
2^32-1, the glibc wrapper pretends the file does not exist by returning
ENOENT.

This flag mitigates the issue by pulling the rug under glibc and rewriting
the quantities to stay within bounds. For timestamps, a fixed date within
range is used. For inode numbers, the value is rewritten in a way that
keeps the (device, inode) pair unique.
---
 arch/x86/gen-syscall.bash |   6 +
 arch/x86/syscall.c        |  10 ++
 arch/x86/syscall.h        |   7 +-
 main.c                    |  14 ++
 man/bst.1.scd             |  17 +++
 sec.c                     | 271 ++++++++++++++++++++++++++++++++++++++
 sec.h                     |   2 +
 7 files changed, 326 insertions(+), 1 deletion(-)

diff --git a/arch/x86/gen-syscall.bash b/arch/x86/gen-syscall.bash
index 3502ed9..05a7e5c 100755
--- a/arch/x86/gen-syscall.bash
+++ b/arch/x86/gen-syscall.bash
@@ -13,6 +13,12 @@ declare -A x86_64_syscalls=(
 declare -A i386_syscalls=(
 	["mknod"]="14"
 	["mknodat"]="297"
+	# i386 stat64 family and statx; sec.c only installs handlers for these when --fix-stat-32bit-overflow is set.
+	["stat64"]="195"
+	["lstat64"]="196"
+	["fstat64"]="197"
+	["fstatat64"]="300"
+	["statx"]="383"
 )
 
 prelude=(
diff --git a/arch/x86/syscall.c b/arch/x86/syscall.c
index fc426c9..32d1dbb 100644
--- a/arch/x86/syscall.c
+++ b/arch/x86/syscall.c
@@ -23,8 +23,18 @@ const struct sock_filter syscall_filter[] = {
 	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
 	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 	BPF_STMT(BPF_LD | BPF_W | BPF_ABS, (offsetof(struct seccomp_data, nr))),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 196, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 195, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 197, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
 	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 14, 0, 1),
 	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 383, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
+	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 300, 0, 1),
+	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
 	BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 297, 0, 1),
 	BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_USER_NOTIF),
 	BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
diff --git a/arch/x86/syscall.h b/arch/x86/syscall.h
index 8ed4914..250c60e 100644
--- a/arch/x86/syscall.h
+++ b/arch/x86/syscall.h
@@ -5,12 +5,17 @@ extern const size_t syscall_filter_length;
 
 #define BST_NR_mknod 133
 #define BST_NR_mknodat 259
+#define BST_NR_lstat64_32 196
+#define BST_NR_stat64_32 195
+#define BST_NR_fstat64_32 197
 #define BST_NR_mknod_32 14
+#define BST_NR_statx_32 383
+#define BST_NR_fstatat64_32 300
 #define BST_NR_mknodat_32 297
 
 #define BST_SECCOMP_32 1
 
 #define BST_NR_MAX 259
-#define BST_NR_MAX32 297
+#define BST_NR_MAX32 383
 
 /* THIS FILE WAS GENERATED BY arch/x86/gen-syscall.bash -- DO NOT EDIT */
diff --git a/main.c b/main.c
index f1fc7ab..35973db 100644
--- a/main.c
+++ b/main.c
@@ -29,6 +29,7 @@
 #include "util.h"
 #include "path.h"
 #include "util.h"
+#include "sec.h"
 
 enum {
 	OPTION_VERSION = 128,
@@ -63,6 +64,10 @@ enum {
 	OPTION_CLOSE_FD,
 	OPTION_CGROUP_DRIVER,
 
+	/* Opt-in feature flags */
+	OPTION_FIX_STAT_32BIT_OVERFLOW,
+
+	/* Opt-out feature flags */
 	OPTION_NO_FAKE_DEVTMPFS,
 	OPTION_NO_DERANDOMIZE,
 	OPTION_NO_PROC_REMOUNT,
@@ -316,6 +321,9 @@ int main(int argc, char *argv[], char *envp[])
 		{ "close-fd",           optional_argument, NULL, OPTION_CLOSE_FD        },
 		{ "cgroup-driver",      required_argument, NULL, OPTION_CGROUP_DRIVER   },
 
+		/* Opt-in feature flags */
+		{ "fix-stat-32bit-overflow", no_argument, NULL, OPTION_FIX_STAT_32BIT_OVERFLOW },
+
 		/* Opt-out feature flags */
 		{ "no-copy-hard-rlimits", no_argument, NULL, OPTION_NO_COPY_HARD_RLIMITS },
 		{ "no-fake-devtmpfs",     no_argument, NULL, OPTION_NO_FAKE_DEVTMPFS     },
@@ -781,6 +789,12 @@ int main(int argc, char *argv[], char *envp[])
 				break;
 			}
 
+			case OPTION_FIX_STAT_32BIT_OVERFLOW:
+			{
+				sec_seccomp_fix_stat_32bit = 1;
+				break;
+			}
+
 			case 'r':
 				opts.root = optarg;
 				break;
diff --git a/man/bst.1.scd b/man/bst.1.scd
index 6195402..38541ad 100644
--- a/man/bst.1.scd
+++ b/man/bst.1.scd
@@ -390,6 +390,23 @@ spacetime process.
 	be useful to pass out-of-band data to the setup program without leaking
 	file descriptors to the spacetime process.
 
+\--fix-stat-32bit-overflow
+	Hijack calls to the stat64 family of system calls and return quantities
+	within 32-bit boundaries.
+
+	On most i686 distributions, glibc implements stat() for programs compiled
+	without -D_FILE_OFFSET_BITS=64 by calling the corresponding stat64 system
+	call, and if any of the 64-bit quantities in the statbuf are larger than
+	2^32-1, the glibc wrapper pretends the file does not exist by returning
+	ENOENT.
+
+	This flag mitigates the issue by pulling the rug out from under glibc and
+	rewriting the quantities to stay within bounds. Timestamps are replaced by a
+	fixed date within range (2000-01-01 12:00:00 UTC). Inode numbers are
+	rewritten in a way that tries to keep the (device, inode) pair unique.
+
+	This flag has no effect on programs running with a 64-bit personality.
+
 \--no-copy-hard-rlimits
 	Do not copy hard limit values to soft limits for all resources mentioned above.
 
diff --git a/sec.c b/sec.c
index 2d394ca..100838f 100644
--- a/sec.c
+++ b/sec.c
@@ -29,6 +29,8 @@
 #include "sec.h"
 #include "util.h"
 
+int sec_seccomp_fix_stat_32bit = 0; /* Non-zero installs the 32-bit stat64/statx handlers; main.c sets it for --fix-stat-32bit-overflow. */
+
 typedef int syscall_handler_func(int, int, struct seccomp_notif *);
 
 enum {
@@ -308,6 +310,259 @@ static int sec__mknodat(int seccomp_fd, int procfd, struct seccomp_notif *req)
 	return sec__mknodat_impl(seccomp_fd, procfd, req, dirfd, pathnameaddr, mode, dev);
 }
 
+struct statx_args { /* State handed from sec__statx to sec__statx_callback through the cookie pointer. */
+	int dirfd; /* value returned by resolve_dirfd() */
+	char pathname[PATH_MAX]; /* target of the `in` arg_buf; presumably receives the caller's pathname string */
+	int flags; /* statx flags taken from syscall argument 2 */
+	unsigned int mask; /* statx mask taken from syscall argument 3 */
+	struct statx statxbuf; /* filled by do_statx(); source of the `out` arg_buf */
+};
+
+/* Run statx(2) and rewrite the result so every quantity fits the 32-bit stat
+   ABI: timestamps become a fixed date and oversized inode/device numbers are
+   folded down. Returns 0 on success, or -errno if statx() fails. */
+static int do_statx(int dirfd, char *pathname, int flags, unsigned int mask, struct statx *statxbuf)
+{
+	/* We always mock timestamps, so no need to query them. */
+	mask &= ~(STATX_ATIME | STATX_BTIME | STATX_MTIME | STATX_CTIME);
+
+	if (statx(dirfd, pathname, flags, mask, statxbuf) == -1) {
+		return -errno;
+	}
+
+	/* Normalize the timestamps to a fixed 32-bit date. */
+	struct statx_timestamp well_known_date = {
+		.tv_sec = 946728000, /* 2000-01-01 12:00:00 +0000 UTC */
+	};
+
+	statxbuf->stx_atime = well_known_date;
+	statxbuf->stx_btime = well_known_date;
+	statxbuf->stx_mtime = well_known_date;
+	statxbuf->stx_ctime = well_known_date;
+
+	/* Normalize the inode so that it fits in 32-bit space.
+	   There's no good way to solve this perfectly, but a reasonable compromise
+	   that keeps the (dev, ino) pair unique is to move the upper 32-bits into
+	   st_dev. On the 32-bit stat struct however, st_dev is also 32-bit wide,
+	   which means we have to split the upper and lower 16 bits of the upper
+	   32-bits of stx_ino into the minor and major numbers of st_dev
+	   respectively. */
+	const uint32_t prime32 = 3432918353;
+	const uint16_t prime16 = 62533;
+
+	if (statxbuf->stx_ino > UINT32_MAX) {
+		uint32_t major, minor;
+		minor  = (uint32_t)statxbuf->stx_dev_minor * prime32;
+		minor ^= ((statxbuf->stx_ino >> 48) & 0xffff);
+		statxbuf->stx_dev_minor = minor;
+		major  = (uint32_t)statxbuf->stx_dev_major * prime32;
+		major ^= ((statxbuf->stx_ino >> 32) & 0xffff);
+		statxbuf->stx_dev_major = major;
+		statxbuf->stx_ino &= 0xffffffff;
+	}
+	/* Fold device numbers wider than 16 bits. The product is computed on a
+	   uint32_t: two uint16_t operands would be promoted to int and overflow. */
+	if (statxbuf->stx_dev_major > UINT16_MAX) {
+		uint32_t lo = statxbuf->stx_dev_major & 0xffff;
+		statxbuf->stx_dev_major = (uint16_t)(lo * prime16) ^ (statxbuf->stx_dev_major >> 16);
+	}
+	if (statxbuf->stx_dev_minor > UINT16_MAX) {
+		uint32_t lo = statxbuf->stx_dev_minor & 0xffff;
+		statxbuf->stx_dev_minor = (uint16_t)(lo * prime16) ^ (statxbuf->stx_dev_minor >> 16);
+	}
+	return 0;
+}
+
+static int sec__statx_callback(int procfd, void *cookie)
+{
+	struct statx_args *call = cookie; /* the struct statx_args set up by sec__statx; procfd is not used */
+	return do_statx(call->dirfd, call->pathname, call->flags, call->mask, &call->statxbuf);
+}
+
+/* Handler for a trapped statx(dirfd, pathname, flags, mask, statxbuf) call.
+   The lookup is replayed through do_statx() by sec__statx_callback, so the
+   struct statx offered back through `out` has already been normalized.
+   Returns resolve_dirfd()'s negative result if the dirfd cannot be resolved,
+   otherwise whatever run_in_process_context() returns. */
+static int sec__statx(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{
+	const uintptr_t path_uaddr = req->data.args[1];
+	const uintptr_t buf_uaddr  = req->data.args[4];
+
+	/* Closed again before returning. */
+	const int dirfd_nr = resolve_dirfd(procfd, (int) req->data.args[0]);
+	if (dirfd_nr < 0) {
+		return dirfd_nr;
+	}
+
+	struct statx_args call = {
+		.dirfd = dirfd_nr,
+		.flags = (int) req->data.args[2],
+		.mask  = (unsigned int) req->data.args[3],
+	};
+
+	/* Input: the pathname string. The size stops one byte short of the
+	   zero-initialized buffer, presumably to keep it NUL-terminated. */
+	struct arg_buf in[] = {
+		{ .addr = path_uaddr,
+		  .buf  = &call.pathname[0],
+		  .size = PATH_MAX-1 },
+		{ .addr = 0 },
+	};
+
+	/* Output: the struct statx filled in by the callback. */
+	struct arg_buf out[] = {
+		{ .addr = buf_uaddr,
+		  .buf  = (char *)&call.statxbuf,
+		  .size = sizeof call.statxbuf },
+		{ .addr = 0 },
+	};
+
+	/* Run the callback; the in/out lists describe the buffers involved. */
+	const int status = run_in_process_context(seccomp_fd, procfd, req,
+			in, out, &call, sec__statx_callback);
+
+	close(dirfd_nr);
+	return status;
+}
+
+struct sec__stat64 { /* Stat buffer that sec__fstatat64_impl offers back through its `out` arg_buf. */
+	uint64_t dev;
+	uint64_t ino;
+	uint64_t nlink;
+	/* NOTE(review): this field order and these widths look like the x86_64 `struct stat`; the handlers using it are registered for 32-bit callers only, and the i386 `struct stat64` is laid out differently -- confirm. */
+	uint32_t mode;
+	uint32_t uid;
+	uint32_t gid;
+	uint32_t __pad0;
+	uint64_t rdev;
+	int64_t size;
+	int64_t blksize;
+	int64_t blocks;
+	/* Timestamps: seconds followed by nanoseconds, for atime, mtime and ctime. */
+	uint64_t atime;
+	uint64_t atime_nsec;
+	uint64_t mtime;
+	uint64_t mtime_nsec;
+	uint64_t ctime;
+	uint64_t ctime_nsec;
+	int64_t __unused[3];
+};
+
+/* State handed from sec__fstatat64_impl to sec__fstatat64_callback through the cookie pointer. */
+struct fstatat64_args {
+	int dirfd; /* value returned by resolve_dirfd() */
+	char pathname[PATH_MAX]; /* target of the `in` arg_buf; presumably receives the caller's pathname string */
+	int flags; /* AT_* flags forwarded to do_statx() */
+	struct sec__stat64 statbuf; /* filled by the callback; source of the `out` arg_buf */
+};
+
+/* Encode (major, minor) as a 64-bit device number without depending on the width of the host's dev_t. */
+static inline uint64_t makedev64(uint32_t major, uint32_t minor)
+{
+	/* We can't use makedev() since it's bit-dependent; this is the same layout
+	   spelled out: major in bits 8-19 and 44-63, minor in bits 0-7 and 20-43. */
+	return ((uint64_t) (major & 0x00000fffu) <<  8)
+	     | ((uint64_t) (major & 0xfffff000u) << 32)
+	     | ((uint64_t) (minor & 0x000000ffu) <<  0)
+	     | ((uint64_t) (minor & 0xffffff00u) << 12);
+}
+
+/* Callback used by sec__fstatat64_impl: queries the file through do_statx() and
+   converts the struct statx result into the struct sec__stat64 held by `cookie`. */
+static int sec__fstatat64_callback(int procfd, void *cookie)
+{
+	struct fstatat64_args *call = cookie;
+	struct sec__stat64 *st = &call->statbuf;
+	struct statx sx;
+
+	const int err = do_statx(call->dirfd, call->pathname, call->flags, STATX_BASIC_STATS, &sx);
+	if (err < 0) {
+		return err;
+	}
+
+	st->dev     = makedev64(sx.stx_dev_major, sx.stx_dev_minor);
+	st->rdev    = makedev64(sx.stx_rdev_major, sx.stx_rdev_minor);
+	st->ino     = sx.stx_ino;
+	st->nlink   = sx.stx_nlink;
+	st->mode    = sx.stx_mode;
+	st->uid     = sx.stx_uid;
+	st->gid     = sx.stx_gid;
+	st->size    = sx.stx_size;
+	st->blksize = sx.stx_blksize;
+	st->blocks  = sx.stx_blocks;
+	st->atime = sx.stx_atime.tv_sec; st->atime_nsec = sx.stx_atime.tv_nsec;
+	st->mtime = sx.stx_mtime.tv_sec; st->mtime_nsec = sx.stx_mtime.tv_nsec;
+	st->ctime = sx.stx_ctime.tv_sec; st->ctime_nsec = sx.stx_ctime.tv_nsec;
+
+	return 0;
+}
+
+/* Shared implementation of the stat64-family handlers. Resolves `dirfd`,
+   replays the lookup through sec__fstatat64_callback and offers the resulting
+   struct sec__stat64 back at `statbufaddr` through the `out` list.
+   Returns resolve_dirfd()'s negative result if the dirfd cannot be resolved,
+   otherwise whatever run_in_process_context() returns. */
+static int sec__fstatat64_impl(int seccomp_fd, int procfd, struct seccomp_notif *req,
+		int dirfd, uintptr_t pathnameaddr, uintptr_t statbufaddr, int flags)
+{
+	/* Closed again before returning. */
+	const int dirfd_nr = resolve_dirfd(procfd, dirfd);
+	if (dirfd_nr < 0) {
+		return dirfd_nr;
+	}
+
+	struct fstatat64_args call = {
+		.dirfd = dirfd_nr,
+		.flags = flags,
+	};
+
+	/* Input: the pathname string. The size stops one byte short of the
+	   zero-initialized buffer, presumably to keep it NUL-terminated.
+	   sec__fstat64 passes address 0 here, the same value that ends the list. */
+	struct arg_buf in[] = {
+		{ .addr = pathnameaddr,
+		  .buf  = &call.pathname[0],
+		  .size = PATH_MAX-1 },
+		{ .addr = 0 },
+	};
+
+	/* Output: the struct sec__stat64 filled in by the callback. */
+	struct arg_buf out[] = {
+		{ .addr = statbufaddr,
+		  .buf  = (char *)&call.statbuf,
+		  .size = sizeof call.statbuf },
+		{ .addr = 0 },
+	};
+
+	/* Run the callback; the in/out lists describe the buffers involved. */
+	const int status = run_in_process_context(seccomp_fd, procfd, req,
+			in, out, &call, sec__fstatat64_callback);
+
+	close(dirfd_nr);
+	return status;
+}
+
+static int sec__stat64(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{ /* stat64(pathname, statbuf): forwarded with AT_FDCWD as dirfd and no flags. */
+	return sec__fstatat64_impl(seccomp_fd, procfd, req, AT_FDCWD, req->data.args[0], req->data.args[1], 0);
+}
+
+static int sec__lstat64(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{ /* lstat64(pathname, statbuf): forwarded with AT_FDCWD as dirfd and AT_SYMLINK_NOFOLLOW. */
+	return sec__fstatat64_impl(seccomp_fd, procfd, req, AT_FDCWD, req->data.args[0], req->data.args[1], AT_SYMLINK_NOFOLLOW);
+}
+
+static int sec__fstat64(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{ /* fstat64(fd, statbuf): the fd is passed as dirfd, with a zero pathname address and AT_EMPTY_PATH. */
+	return sec__fstatat64_impl(seccomp_fd, procfd, req, req->data.args[0], 0, req->data.args[1], AT_EMPTY_PATH);
+}
+
+static int sec__fstatat64(int seccomp_fd, int procfd, struct seccomp_notif *req)
+{ /* fstatat64(dirfd, pathname, statbuf, flags): the four syscall arguments are forwarded in that order. */
+	return sec__fstatat64_impl(seccomp_fd, procfd, req, req->data.args[0], req->data.args[1], req->data.args[2], req->data.args[3]);
+}
+
 static int seccomp(unsigned int op, unsigned int flags, void *args)
 {
 	return syscall(__NR_seccomp, op, flags, args);
@@ -350,6 +605,22 @@ static void sec_seccomp_dispatch_syscall(int seccomp_fd,
 #endif
 		[BST_NR_mknodat_32] = sec__mknodat,
 	};
+
+	if (sec_seccomp_fix_stat_32bit) {
+#ifdef BST_NR_stat64_32
+		syscall_table_32[BST_NR_stat64_32] = sec__stat64;
+#endif
+#ifdef BST_NR_lstat64_32
+		syscall_table_32[BST_NR_lstat64_32] = sec__lstat64;
+#endif
+#ifdef BST_NR_fstat64_32
+		syscall_table_32[BST_NR_fstat64_32] = sec__fstat64;
+#endif
+#ifdef BST_NR_fstatat64_32
+		syscall_table_32[BST_NR_fstatat64_32] = sec__fstatat64;
+#endif
+		syscall_table_32[BST_NR_statx_32] = sec__statx;
+	}
 #endif
 
 	resp->id = req->id;
diff --git a/sec.h b/sec.h
index 1da2ce3..08b3bc9 100644
--- a/sec.h
+++ b/sec.h
@@ -12,4 +12,6 @@
 int sec_seccomp_install_filter(void);
 noreturn void sec_seccomp_supervisor(int);
 
+extern int sec_seccomp_fix_stat_32bit; /* Non-zero installs the 32-bit stat64/statx handlers (--fix-stat-32bit-overflow). */
+
 #endif /* !SEC_H_ */