Commit e6ca4f16 authored by David S. Miller

Merge branch 'bpf-lru'

Martin KaFai Lau says:

====================
bpf: LRU map

This patch set adds an LRU map implementation to the existing BPF map
family.

The first few patches introduce the basic BPF LRU list
implementation.

The later patches introduce the LRU versions of the
existing BPF_MAP_TYPE_[PERCPU_]HASH maps by leveraging
the BPF LRU list.

v2:
- Added a percpu LRU list option which can be specified as
  a map attribute.

  [Note: the percpu LRU list is independent of whether the map's values are percpu]

- Removed the cpu variable from the struct bpf_lru_locallist
  since it is not needed.

- Changed __bpf_lru_node_move_out to __bpf_lru_node_move_to_free in
  patch 1 to prepare for the percpu LRU list in patch 2.

- Moved test_lru_map under selftests.

- Refactored a few things in the test code.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents bb598c1b 5db58faf
...@@ -85,6 +85,8 @@ enum bpf_map_type {
BPF_MAP_TYPE_PERCPU_ARRAY,
BPF_MAP_TYPE_STACK_TRACE,
BPF_MAP_TYPE_CGROUP_ARRAY,
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
};
enum bpf_prog_type {
...@@ -106,6 +108,13 @@ enum bpf_prog_type {
#define BPF_EXIST 2 /* update existing element */
#define BPF_F_NO_PREALLOC (1U << 0)
/* Instead of having one common LRU list in the
* BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
* which can scale and perform better.
* Note, the LRU nodes (including free nodes) cannot be moved
* across different LRU lists.
*/
#define BPF_F_NO_COMMON_LRU (1U << 1)
union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */
...
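For illustration only (a sketch, not part of this commit's diffs): with the bpf_create_map() wrapper from samples/bpf/libbpf.h, the new flag simply rides along in map_flags, trading the single shared LRU list for one list per CPU:

#include <linux/bpf.h>
#include "libbpf.h"

int create_lru_maps(void)
{
	int common_fd, percpu_fd;

	/* Default: all CPUs share one common LRU list. */
	common_fd = bpf_create_map(BPF_MAP_TYPE_LRU_HASH,
				   sizeof(__u32), sizeof(long),
				   10000, 0);

	/* BPF_F_NO_COMMON_LRU: one LRU list per CPU; nodes (including
	 * free nodes) never migrate between the per-cpu lists.
	 */
	percpu_fd = bpf_create_map(BPF_MAP_TYPE_LRU_HASH,
				   sizeof(__u32), sizeof(long),
				   10000, BPF_F_NO_COMMON_LRU);

	return (common_fd < 0 || percpu_fd < 0) ? -1 : 0;
}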
obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#ifndef __BPF_LRU_LIST_H_
#define __BPF_LRU_LIST_H_
#include <linux/list.h>
#include <linux/spinlock_types.h>
#define NR_BPF_LRU_LIST_T (3)
#define NR_BPF_LRU_LIST_COUNT (2)
#define NR_BPF_LRU_LOCAL_LIST_T (2)
#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
enum bpf_lru_list_type {
BPF_LRU_LIST_T_ACTIVE,
BPF_LRU_LIST_T_INACTIVE,
BPF_LRU_LIST_T_FREE,
BPF_LRU_LOCAL_LIST_T_FREE,
BPF_LRU_LOCAL_LIST_T_PENDING,
};
struct bpf_lru_node {
struct list_head list;
u16 cpu;
u8 type;
u8 ref;
};
struct bpf_lru_list {
struct list_head lists[NR_BPF_LRU_LIST_T];
unsigned int counts[NR_BPF_LRU_LIST_COUNT];
/* The next inactive list rotation starts from here */
struct list_head *next_inactive_rotation;
raw_spinlock_t lock ____cacheline_aligned_in_smp;
};
struct bpf_lru_locallist {
struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
u16 next_steal;
raw_spinlock_t lock;
};
struct bpf_common_lru {
struct bpf_lru_list lru_list;
struct bpf_lru_locallist __percpu *local_list;
};
typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node);
struct bpf_lru {
union {
struct bpf_common_lru common_lru;
struct bpf_lru_list __percpu *percpu_lru;
};
del_from_htab_func del_from_htab;
void *del_arg;
unsigned int hash_offset;
unsigned int nr_scans;
bool percpu;
};
static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
{
/* ref is an approximation on access frequency. It does not
* have to be very accurate. Hence, no protection is used.
*/
node->ref = 1;
}
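To make the ref bit's role concrete, here is an illustrative CLOCK-style rotation (a hypothetical sketch, not the code from this patch set; count bookkeeping and locking are omitted). Nodes referenced since the last scan stay active and have their bit cleared; unreferenced ones are demoted to the inactive list, where the shrinker looks for victims:

static void example_rotate_active(struct bpf_lru_list *l)
{
	struct bpf_lru_node *node, *tmp;

	list_for_each_entry_safe(node, tmp,
				 &l->lists[BPF_LRU_LIST_T_ACTIVE], list) {
		if (node->ref) {
			/* Accessed since the last scan: keep it active
			 * and clear the bit so future accesses can be
			 * observed again.
			 */
			node->ref = 0;
			list_move(&node->list,
				  &l->lists[BPF_LRU_LIST_T_ACTIVE]);
		} else {
			/* Cold: demote to inactive, making it a
			 * candidate for the next shrink.
			 */
			node->type = BPF_LRU_LIST_T_INACTIVE;
			list_move(&node->list,
				  &l->lists[BPF_LRU_LIST_T_INACTIVE]);
		}
	}
}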
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
del_from_htab_func del_from_htab, void *delete_arg);
void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
u32 elem_size, u32 nr_elems);
void bpf_lru_destroy(struct bpf_lru *lru);
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
#endif
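For orientation, a minimal sketch of how a caller such as a hash-table map might drive this API (hypothetical element layout and names; error paths trimmed). Each element embeds a struct bpf_lru_node and bpf_lru_populate() donates every preallocated node to the free list; on insert the caller takes a node with bpf_lru_pop_free() (which may shrink via the del_from_htab callback), on delete it returns the node with bpf_lru_push_free(), and lookups just call bpf_lru_node_set_ref():

struct example_elem {
	struct bpf_lru_node lru_node;	/* embedded LRU node */
	u32 hash;			/* found via hash_offset */
	char data[32];
};

/* Invoked when the LRU needs to reclaim a node: unlink the owning
 * element from the caller's table and return true if the node may
 * now be reused.
 */
static bool example_del_from_htab(void *arg, struct bpf_lru_node *node)
{
	/* ... remove container_of(node, struct example_elem, lru_node)
	 * from the caller's hash table ...
	 */
	return true;
}

static int example_setup(struct bpf_lru *lru, void *elems, u32 nr_elems)
{
	int err;

	err = bpf_lru_init(lru, false /* one common LRU list */,
			   offsetof(struct example_elem, hash),
			   example_del_from_htab, NULL);
	if (err)
		return err;

	/* Donate all preallocated elements' nodes to the free list. */
	bpf_lru_populate(lru, elems,
			 offsetof(struct example_elem, lru_node),
			 sizeof(struct example_elem), nr_elems);
	return 0;
}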
...@@ -292,6 +292,7 @@ static int map_lookup_elem(union bpf_attr *attr)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else
...@@ -302,7 +303,8 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
...@@ -366,6 +368,7 @@ static int map_update_elem(union bpf_attr *attr)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else
...@@ -385,7 +388,8 @@ static int map_update_elem(union bpf_attr *attr)
*/
preempt_disable();
__this_cpu_inc(bpf_prog_active);
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
...
...@@ -2,6 +2,7 @@
obj- := dummy.o
# List of programs to build
hostprogs-y := test_lru_dist
hostprogs-y += sock_example
hostprogs-y += fds_example
hostprogs-y += sockex1
...@@ -28,6 +29,7 @@ hostprogs-y += trace_event
hostprogs-y += sampleip
hostprogs-y += tc_l2_redirect
test_lru_dist-objs := test_lru_dist.o libbpf.o
sock_example-objs := sock_example.o libbpf.o
fds_example-objs := bpf_load.o libbpf.o fds_example.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
...
...@@ -19,6 +19,21 @@ struct bpf_map_def SEC("maps") hash_map = {
.max_entries = MAX_ENTRIES,
};
struct bpf_map_def SEC("maps") lru_hash_map = {
.type = BPF_MAP_TYPE_LRU_HASH,
.key_size = sizeof(u32),
.value_size = sizeof(long),
.max_entries = 10000,
};
struct bpf_map_def SEC("maps") percpu_lru_hash_map = {
.type = BPF_MAP_TYPE_LRU_HASH,
.key_size = sizeof(u32),
.value_size = sizeof(long),
.max_entries = 10000,
.map_flags = BPF_F_NO_COMMON_LRU,
};
struct bpf_map_def SEC("maps") percpu_hash_map = { struct bpf_map_def SEC("maps") percpu_hash_map = {
.type = BPF_MAP_TYPE_PERCPU_HASH, .type = BPF_MAP_TYPE_PERCPU_HASH,
.key_size = sizeof(u32), .key_size = sizeof(u32),
...@@ -53,6 +68,7 @@ int stress_hmap(struct pt_regs *ctx) ...@@ -53,6 +68,7 @@ int stress_hmap(struct pt_regs *ctx)
value = bpf_map_lookup_elem(&hash_map, &key); value = bpf_map_lookup_elem(&hash_map, &key);
if (value) if (value)
bpf_map_delete_elem(&hash_map, &key); bpf_map_delete_elem(&hash_map, &key);
return 0; return 0;
} }
...@@ -96,5 +112,28 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx) ...@@ -96,5 +112,28 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx)
bpf_map_delete_elem(&percpu_hash_map_alloc, &key); bpf_map_delete_elem(&percpu_hash_map_alloc, &key);
return 0; return 0;
} }
SEC("kprobe/sys_getpid")
int stress_lru_hmap_alloc(struct pt_regs *ctx)
{
u32 key = bpf_get_prandom_u32();
long val = 1;
bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY);
return 0;
}
SEC("kprobe/sys_getppid")
int stress_percpu_lru_hmap_alloc(struct pt_regs *ctx)
{
u32 key = bpf_get_prandom_u32();
long val = 1;
bpf_map_update_elem(&percpu_lru_hash_map, &key, &val, BPF_ANY);
return 0;
}
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
...@@ -35,6 +35,8 @@ static __u64 time_get_ns(void)
#define PERCPU_HASH_PREALLOC (1 << 1)
#define HASH_KMALLOC (1 << 2)
#define PERCPU_HASH_KMALLOC (1 << 3)
#define LRU_HASH_PREALLOC (1 << 4)
#define PERCPU_LRU_HASH_PREALLOC (1 << 5)
static int test_flags = ~0;
...@@ -50,6 +52,30 @@ static void test_hash_prealloc(int cpu)
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
static void test_lru_hash_prealloc(int cpu)
{
__u64 start_time;
int i;
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
syscall(__NR_getpid);
printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n",
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
static void test_percpu_lru_hash_prealloc(int cpu)
{
__u64 start_time;
int i;
start_time = time_get_ns();
for (i = 0; i < MAX_CNT; i++)
syscall(__NR_getppid);
printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n",
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
static void test_percpu_hash_prealloc(int cpu)
{
__u64 start_time;
...@@ -105,6 +131,12 @@ static void loop(int cpu)
if (test_flags & PERCPU_HASH_KMALLOC)
test_percpu_hash_kmalloc(cpu);
if (test_flags & LRU_HASH_PREALLOC)
test_lru_hash_prealloc(cpu);
if (test_flags & PERCPU_LRU_HASH_PREALLOC)
test_percpu_lru_hash_prealloc(cpu);
}
static void run_perf_test(int tasks)
...
CFLAGS += -Wall -O2 -I../../../../usr/include
test_objs = test_verifier test_maps test_lru_map
TEST_PROGS := test_verifier test_maps test_lru_map test_kmod.sh
TEST_FILES := $(test_objs)
all: $(test_objs)
...