Commit e95a5f54 authored by David S. Miller's avatar David S. Miller

Merge branch 'bpfilter'

Alexei Starovoitov says:

====================
bpfilter

v2->v3:
- followed Luis's suggestion and significantly simplied first patch
  with shmem_kernel_file_setup+kernel_write. Added kdoc for new helper
- fixed typos and race to access pipes with mutex
- tested with bpfilter being 'builtin'. CONFIG_BPFILTER_UMH=y|m both work.
  Interesting to see a usermode executable being embedded inside vmlinux.
- it doesn't hurt to enable bpfilter in .config.
  ip_setsockopt commands sent to usermode via pipes and -ENOPROTOOPT is
  returned from userspace, so kernel falls back to original iptables code

v1->v2:
this patch set is almost a full rewrite of the earlier umh modules approach
The v1 of patches and follow up discussion was covered by LWN:
https://lwn.net/Articles/749108/

I believe the v2 addresses all issues brought up by Andy and others.
Mainly there are zero changes to kernel/module.c
Instead of teaching module loading logic to recognize special
umh module, let normal kernel modules execute part of its own
.init.rodata as a new user space process (Andy's idea)
Patch 1 introduces this new helper:
int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
Input:
  data + len == executable file
Output:
  struct umh_info {
       struct file *pipe_to_umh;
       struct file *pipe_from_umh;
       pid_t pid;
  };

Advantages vs v1:
- the embedded user mode executable is stored as .init.rodata inside
  normal kernel module. These pages are freed when .ko finishes loading
- the elf file is copied into tmpfs file. The user mode process is swappable.
- the communication between user mode process and 'parent' kernel module
  is done via two unix pipes, hence protocol is not exposed to
  user space
- impossible to launch umh on its own (that was the main issue of v1)
  and impossible to be man-in-the-middle due to pipes
- bpfilter.ko consists of tiny kernel part that passes the data
  between kernel and umh via pipes and much bigger umh part that
  doing all the work
- 'lsmod' shows bpfilter.ko as usual.
  'rmmod bpfilter' removes kernel module and kills corresponding umh
- signed bpfilter.ko covers the whole image including umh code

Few issues:
- the user can still attach to the process and debug it with
  'gdb /proc/pid/exe pid', but 'gdb -p pid' doesn't work.
  (a bit worse comparing to v1)
- tinyconfig will notice a small increase in .text
  +766 | TEXT | 7c8b94806bec umh: introduce fork_usermode_blob() helper
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 1fe8c06c d2ba09c1
......@@ -1706,14 +1706,13 @@ static int exec_binprm(struct linux_binprm *bprm)
/*
* sys_execve() executes a new program.
*/
static int do_execveat_common(int fd, struct filename *filename,
static int __do_execve_file(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags)
int flags, struct file *file)
{
char *pathbuf = NULL;
struct linux_binprm *bprm;
struct file *file;
struct files_struct *displaced;
int retval;
......@@ -1752,6 +1751,7 @@ static int do_execveat_common(int fd, struct filename *filename,
check_unsafe_exec(bprm);
current->in_execve = 1;
if (!file)
file = do_open_execat(fd, filename, flags);
retval = PTR_ERR(file);
if (IS_ERR(file))
......@@ -1760,7 +1760,9 @@ static int do_execveat_common(int fd, struct filename *filename,
sched_exec();
bprm->file = file;
if (fd == AT_FDCWD || filename->name[0] == '/') {
if (!filename) {
bprm->filename = "none";
} else if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
} else {
if (filename->name[0] == '\0')
......@@ -1826,6 +1828,7 @@ static int do_execveat_common(int fd, struct filename *filename,
task_numa_free(current);
free_bprm(bprm);
kfree(pathbuf);
if (filename)
putname(filename);
if (displaced)
put_files_struct(displaced);
......@@ -1849,10 +1852,27 @@ static int do_execveat_common(int fd, struct filename *filename,
if (displaced)
reset_files_struct(displaced);
out_ret:
if (filename)
putname(filename);
return retval;
}
static int do_execveat_common(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
int flags)
{
return __do_execve_file(fd, filename, argv, envp, flags, NULL);
}
int do_execve_file(struct file *file, void *__argv, void *__envp)
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
return __do_execve_file(AT_FDCWD, NULL, argv, envp, 0, file);
}
int do_execve(struct filename *filename,
const char __user *const __user *__argv,
const char __user *const __user *__envp)
......
......@@ -150,5 +150,6 @@ extern int do_execveat(int, struct filename *,
const char __user * const __user *,
const char __user * const __user *,
int);
int do_execve_file(struct file *file, void *__argv, void *__envp);
#endif /* _LINUX_BINFMTS_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_BPFILTER_H
#define _LINUX_BPFILTER_H
#include <uapi/linux/bpfilter.h>
struct sock;
int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char *optval,
unsigned int optlen);
int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char *optval,
int *optlen);
extern int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
char __user *optval,
unsigned int optlen, bool is_set);
#endif
......@@ -22,8 +22,10 @@ struct subprocess_info {
const char *path;
char **argv;
char **envp;
struct file *file;
int wait;
int retval;
pid_t pid;
int (*init)(struct subprocess_info *info, struct cred *new);
void (*cleanup)(struct subprocess_info *info);
void *data;
......@@ -38,6 +40,16 @@ call_usermodehelper_setup(const char *path, char **argv, char **envp,
int (*init)(struct subprocess_info *info, struct cred *new),
void (*cleanup)(struct subprocess_info *), void *data);
struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
int (*init)(struct subprocess_info *info, struct cred *new),
void (*cleanup)(struct subprocess_info *), void *data);
struct umh_info {
struct file *pipe_to_umh;
struct file *pipe_from_umh;
pid_t pid;
};
int fork_usermode_blob(void *data, size_t len, struct umh_info *info);
extern int
call_usermodehelper_exec(struct subprocess_info *info, int wait);
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _UAPI_LINUX_BPFILTER_H
#define _UAPI_LINUX_BPFILTER_H
#include <linux/if.h>
enum {
BPFILTER_IPT_SO_SET_REPLACE = 64,
BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65,
BPFILTER_IPT_SET_MAX,
};
enum {
BPFILTER_IPT_SO_GET_INFO = 64,
BPFILTER_IPT_SO_GET_ENTRIES = 65,
BPFILTER_IPT_SO_GET_REVISION_MATCH = 66,
BPFILTER_IPT_SO_GET_REVISION_TARGET = 67,
BPFILTER_IPT_GET_MAX,
};
#endif /* _UAPI_LINUX_BPFILTER_H */
......@@ -25,6 +25,8 @@
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>
#include <linux/shmem_fs.h>
#include <linux/pipe_fs_i.h>
#include <trace/events/module.h>
......@@ -97,6 +99,10 @@ static int call_usermodehelper_exec_async(void *data)
commit_creds(new);
if (sub_info->file)
retval = do_execve_file(sub_info->file,
sub_info->argv, sub_info->envp);
else
retval = do_execve(getname_kernel(sub_info->path),
(const char __user *const __user *)sub_info->argv,
(const char __user *const __user *)sub_info->envp);
......@@ -185,6 +191,8 @@ static void call_usermodehelper_exec_work(struct work_struct *work)
if (pid < 0) {
sub_info->retval = pid;
umh_complete(sub_info);
} else {
sub_info->pid = pid;
}
}
}
......@@ -393,6 +401,117 @@ struct subprocess_info *call_usermodehelper_setup(const char *path, char **argv,
}
EXPORT_SYMBOL(call_usermodehelper_setup);
struct subprocess_info *call_usermodehelper_setup_file(struct file *file,
int (*init)(struct subprocess_info *info, struct cred *new),
void (*cleanup)(struct subprocess_info *info), void *data)
{
struct subprocess_info *sub_info;
sub_info = kzalloc(sizeof(struct subprocess_info), GFP_KERNEL);
if (!sub_info)
return NULL;
INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
sub_info->path = "none";
sub_info->file = file;
sub_info->init = init;
sub_info->cleanup = cleanup;
sub_info->data = data;
return sub_info;
}
static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
{
struct umh_info *umh_info = info->data;
struct file *from_umh[2];
struct file *to_umh[2];
int err;
/* create pipe to send data to umh */
err = create_pipe_files(to_umh, 0);
if (err)
return err;
err = replace_fd(0, to_umh[0], 0);
fput(to_umh[0]);
if (err < 0) {
fput(to_umh[1]);
return err;
}
/* create pipe to receive data from umh */
err = create_pipe_files(from_umh, 0);
if (err) {
fput(to_umh[1]);
replace_fd(0, NULL, 0);
return err;
}
err = replace_fd(1, from_umh[1], 0);
fput(from_umh[1]);
if (err < 0) {
fput(to_umh[1]);
replace_fd(0, NULL, 0);
fput(from_umh[0]);
return err;
}
umh_info->pipe_to_umh = to_umh[1];
umh_info->pipe_from_umh = from_umh[0];
return 0;
}
static void umh_save_pid(struct subprocess_info *info)
{
struct umh_info *umh_info = info->data;
umh_info->pid = info->pid;
}
/**
* fork_usermode_blob - fork a blob of bytes as a usermode process
* @data: a blob of bytes that can be do_execv-ed as a file
* @len: length of the blob
* @info: information about usermode process (shouldn't be NULL)
*
* Returns either negative error or zero which indicates success
* in executing a blob of bytes as a usermode process. In such
* case 'struct umh_info *info' is populated with two pipes
* and a pid of the process. The caller is responsible for health
* check of the user process, killing it via pid, and closing the
* pipes when user process is no longer needed.
*/
int fork_usermode_blob(void *data, size_t len, struct umh_info *info)
{
struct subprocess_info *sub_info;
struct file *file;
ssize_t written;
loff_t pos = 0;
int err;
file = shmem_kernel_file_setup("", len, 0);
if (IS_ERR(file))
return PTR_ERR(file);
written = kernel_write(file, data, len, &pos);
if (written != len) {
err = written;
if (err >= 0)
err = -ENOMEM;
goto out;
}
err = -ENOMEM;
sub_info = call_usermodehelper_setup_file(file, umh_pipe_setup,
umh_save_pid, info);
if (!sub_info)
goto out;
err = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
out:
fput(file);
return err;
}
EXPORT_SYMBOL_GPL(fork_usermode_blob);
/**
* call_usermodehelper_exec - start a usermode application
* @sub_info: information about the subprocessa
......
......@@ -202,6 +202,8 @@ source "net/bridge/netfilter/Kconfig"
endif
source "net/bpfilter/Kconfig"
source "net/dccp/Kconfig"
source "net/sctp/Kconfig"
source "net/rds/Kconfig"
......
......@@ -20,6 +20,7 @@ obj-$(CONFIG_TLS) += tls/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
obj-$(CONFIG_NET) += ipv6/
obj-$(CONFIG_BPFILTER) += bpfilter/
obj-$(CONFIG_PACKET) += packet/
obj-$(CONFIG_NET_KEY) += key/
obj-$(CONFIG_BRIDGE) += bridge/
......
menuconfig BPFILTER
bool "BPF based packet filtering framework (BPFILTER)"
default n
depends on NET && BPF
help
This builds experimental bpfilter framework that is aiming to
provide netfilter compatible functionality via BPF
if BPFILTER
config BPFILTER_UMH
tristate "bpfilter kernel module with user mode helper"
default m
help
This builds bpfilter kernel module with embedded user mode helper
endif
# SPDX-License-Identifier: GPL-2.0
#
# Makefile for the Linux BPFILTER layer.
#
hostprogs-y := bpfilter_umh
bpfilter_umh-objs := main.o
HOSTCFLAGS += -I. -Itools/include/
ifeq ($(CONFIG_BPFILTER_UMH), y)
# builtin bpfilter_umh should be compiled with -static
# since rootfs isn't mounted at the time of __init
# function is called and do_execv won't find elf interpreter
HOSTLDFLAGS += -static
endif
# a bit of elf magic to convert bpfilter_umh binary into a binary blob
# inside bpfilter_umh.o elf file referenced by
# _binary_net_bpfilter_bpfilter_umh_start symbol
# which bpfilter_kern.c passes further into umh blob loader at run-time
quiet_cmd_copy_umh = GEN $@
cmd_copy_umh = echo ':' > $(obj)/.bpfilter_umh.o.cmd; \
$(OBJCOPY) -I binary -O $(CONFIG_OUTPUT_FORMAT) \
-B `$(OBJDUMP) -f $<|grep architecture|cut -d, -f1|cut -d' ' -f2` \
--rename-section .data=.init.rodata $< $@
$(obj)/bpfilter_umh.o: $(obj)/bpfilter_umh
$(call cmd,copy_umh)
obj-$(CONFIG_BPFILTER_UMH) += bpfilter.o
bpfilter-objs += bpfilter_kern.o bpfilter_umh.o
// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/init.h>
#include <linux/module.h>
#include <linux/umh.h>
#include <linux/bpfilter.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/fs.h>
#include <linux/file.h>
#include "msgfmt.h"
#define UMH_start _binary_net_bpfilter_bpfilter_umh_start
#define UMH_end _binary_net_bpfilter_bpfilter_umh_end
extern char UMH_start;
extern char UMH_end;
static struct umh_info info;
/* since ip_getsockopt() can run in parallel, serialize access to umh */
static DEFINE_MUTEX(bpfilter_lock);
static void shutdown_umh(struct umh_info *info)
{
struct task_struct *tsk;
tsk = pid_task(find_vpid(info->pid), PIDTYPE_PID);
if (tsk)
force_sig(SIGKILL, tsk);
fput(info->pipe_to_umh);
fput(info->pipe_from_umh);
}
static void __stop_umh(void)
{
if (bpfilter_process_sockopt) {
bpfilter_process_sockopt = NULL;
shutdown_umh(&info);
}
}
static void stop_umh(void)
{
mutex_lock(&bpfilter_lock);
__stop_umh();
mutex_unlock(&bpfilter_lock);
}
static int __bpfilter_process_sockopt(struct sock *sk, int optname,
char __user *optval,
unsigned int optlen, bool is_set)
{
struct mbox_request req;
struct mbox_reply reply;
loff_t pos;
ssize_t n;
int ret;
req.is_set = is_set;
req.pid = current->pid;
req.cmd = optname;
req.addr = (long)optval;
req.len = optlen;
mutex_lock(&bpfilter_lock);
n = __kernel_write(info.pipe_to_umh, &req, sizeof(req), &pos);
if (n != sizeof(req)) {
pr_err("write fail %zd\n", n);
__stop_umh();
ret = -EFAULT;
goto out;
}
pos = 0;
n = kernel_read(info.pipe_from_umh, &reply, sizeof(reply), &pos);
if (n != sizeof(reply)) {
pr_err("read fail %zd\n", n);
__stop_umh();
ret = -EFAULT;
goto out;
}
ret = reply.status;
out:
mutex_unlock(&bpfilter_lock);
return ret;
}
static int __init load_umh(void)
{
int err;
/* fork usermode process */
err = fork_usermode_blob(&UMH_start, &UMH_end - &UMH_start, &info);
if (err)
return err;
pr_info("Loaded bpfilter_umh pid %d\n", info.pid);
/* health check that usermode process started correctly */
if (__bpfilter_process_sockopt(NULL, 0, 0, 0, 0) != 0) {
stop_umh();
return -EFAULT;
}
bpfilter_process_sockopt = &__bpfilter_process_sockopt;
return 0;
}
static void __exit fini_umh(void)
{
stop_umh();
}
module_init(load_umh);
module_exit(fini_umh);
MODULE_LICENSE("GPL");
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <sys/uio.h>
#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>
#include <fcntl.h>
#include <unistd.h>
#include "include/uapi/linux/bpf.h"
#include <asm/unistd.h>
#include "msgfmt.h"
int debug_fd;
static int handle_get_cmd(struct mbox_request *cmd)
{
switch (cmd->cmd) {
case 0:
return 0;
default:
break;
}
return -ENOPROTOOPT;
}
static int handle_set_cmd(struct mbox_request *cmd)
{
return -ENOPROTOOPT;
}
static void loop(void)
{
while (1) {
struct mbox_request req;
struct mbox_reply reply;
int n;
n = read(0, &req, sizeof(req));
if (n != sizeof(req)) {
dprintf(debug_fd, "invalid request %d\n", n);
return;
}
reply.status = req.is_set ?
handle_set_cmd(&req) :
handle_get_cmd(&req);
n = write(1, &reply, sizeof(reply));
if (n != sizeof(reply)) {
dprintf(debug_fd, "reply failed %d\n", n);
return;
}
}
}
int main(void)
{
debug_fd = open("/dev/console", 00000002 | 00000100);
dprintf(debug_fd, "Started bpfilter\n");
loop();
close(debug_fd);
return 0;
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NET_BPFILTER_MSGFMT_H
#define _NET_BPFILTER_MSGFMT_H
struct mbox_request {
__u64 addr;
__u32 len;
__u32 is_set;
__u32 cmd;
__u32 pid;
};
struct mbox_reply {
__u32 status;
};
#endif
......@@ -16,6 +16,8 @@ obj-y := route.o inetpeer.o protocol.o \
inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \
metrics.o
obj-$(CONFIG_BPFILTER) += bpfilter/
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
obj-$(CONFIG_PROC_FS) += proc.o
......
obj-$(CONFIG_BPFILTER) += sockopt.o
// SPDX-License-Identifier: GPL-2.0
#include <linux/uaccess.h>
#include <linux/bpfilter.h>
#include <uapi/linux/bpf.h>
#include <linux/wait.h>
#include <linux/kmod.h>
int (*bpfilter_process_sockopt)(struct sock *sk, int optname,
char __user *optval,
unsigned int optlen, bool is_set);
EXPORT_SYMBOL_GPL(bpfilter_process_sockopt);
int bpfilter_mbox_request(struct sock *sk, int optname, char __user *optval,
unsigned int optlen, bool is_set)
{
if (!bpfilter_process_sockopt) {
int err = request_module("bpfilter");
if (err)
return err;
if (!bpfilter_process_sockopt)
return -ECHILD;
}
return bpfilter_process_sockopt(sk, optname, optval, optlen, is_set);
}
int bpfilter_ip_set_sockopt(struct sock *sk, int optname, char __user *optval,
unsigned int optlen)
{
return bpfilter_mbox_request(sk, optname, optval, optlen, true);
}
int bpfilter_ip_get_sockopt(struct sock *sk, int optname, char __user *optval,
int __user *optlen)
{
int len;
if (get_user(len, optlen))
return -EFAULT;
return bpfilter_mbox_request(sk, optname, optval, len, false);
}
......@@ -47,6 +47,8 @@
#include <linux/errqueue.h>
#include <linux/uaccess.h>
#include <linux/bpfilter.h>
/*
* SOL_IP control messages.
*/
......@@ -1244,6 +1246,11 @@ int ip_setsockopt(struct sock *sk, int level,
return -ENOPROTOOPT;
err = do_ip_setsockopt(sk, level, optname, optval, optlen);
#ifdef CONFIG_BPFILTER
if (optname >= BPFILTER_IPT_SO_SET_REPLACE &&
optname < BPFILTER_IPT_SET_MAX)
err = bpfilter_ip_set_sockopt(sk, optname, optval, optlen);
#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_HDRINCL &&
......@@ -1552,6 +1559,11 @@ int ip_getsockopt(struct sock *sk, int level,
int err;
err = do_ip_getsockopt(sk, level, optname, optval, optlen, 0);
#ifdef CONFIG_BPFILTER
if (optname >= BPFILTER_IPT_SO_GET_INFO &&
optname < BPFILTER_IPT_GET_MAX)
err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
......@@ -1584,6 +1596,11 @@ int compat_ip_getsockopt(struct sock *sk, int level, int optname,
err = do_ip_getsockopt(sk, level, optname, optval, optlen,
MSG_CMSG_COMPAT);
#ifdef CONFIG_BPFILTER
if (optname >= BPFILTER_IPT_SO_GET_INFO &&
optname < BPFILTER_IPT_GET_MAX)
err = bpfilter_ip_get_sockopt(sk, optname, optval, optlen);
#endif
#ifdef CONFIG_NETFILTER
/* we need to exclude all possible ENOPROTOOPTs except default case */
if (err == -ENOPROTOOPT && optname != IP_PKTOPTIONS &&
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment