Commit c69d2ddb authored by Yonghong Song, committed by Alexei Starovoitov

bpf: Using rcu_read_lock for bpf_sk_storage_map iterator

If a bucket contains a lot of sockets, then while bpf_iter is
traversing that bucket, concurrent userspace bpf_map_update_elem()
calls and bpf program bpf_sk_storage_{get,delete}() helpers may
experience undesirable delays, as they compete with bpf_iter for
the bucket lock.
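
To illustrate the contention, here is a minimal sketch of the
pre-patch locking pattern (paraphrased from the iterator code in
the diff below, not verbatim kernel source):

  /* bpf_iter side: the bucket lock is held across the whole walk */
  raw_spin_lock_bh(&b->lock);
  hlist_for_each_entry(selem, &b->list, map_node) {
  	/* visit the element, run the bpf_iter program ... */
  }
  raw_spin_unlock_bh(&b->lock);

  /* writer side: bpf_map_update_elem() and
   * bpf_sk_storage_{get,delete}() hashing to the same bucket must
   * spin here until the iterator releases the lock
   */
  raw_spin_lock_bh(&b->lock);
  /* ... link/unlink selem ... */
  raw_spin_unlock_bh(&b->lock);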

Note that the number of buckets for bpf_sk_storage_map is roughly
the same as the number of CPUs. So if there are lots of sockets
in the system, each bucket could contain lots of sockets.
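
For reference, the bucket count is derived from the CPU count at
map allocation time, roughly as follows (a sketch of the map
allocation path; exact code may differ across kernel versions):

  nbuckets = roundup_pow_of_two(num_possible_cpus());
  /* use at least 2 buckets */
  nbuckets = max_t(u32, 2, nbuckets);
  smap->bucket_log = ilog2(nbuckets);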

Actual delays vary across use cases. Here, using the bpf_iter
selftest subtest bpf_sk_storage_map, I hacked the kernel with
ktime_get_mono_fast_ns() to record how long a bucket stayed locked
while the bpf_iter prog traversed it. This way, the maximum
incurred delay was measured with respect to the number of elements
in the bucket.
    # elems in each bucket          delay(ns)
      64                            17000
      256                           72512
      2048                          875246
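
The instrumentation was along these lines (illustrative sketch
only, not part of this patch; max_delay is a hypothetical tracking
variable):

  u64 start, delta;

  raw_spin_lock_bh(&b->lock);
  start = ktime_get_mono_fast_ns();
  hlist_for_each_entry(selem, &b->list, map_node) {
  	/* ... show each element to the bpf_iter program ... */
  }
  delta = ktime_get_mono_fast_ns() - start;
  raw_spin_unlock_bh(&b->lock);

  if (delta > max_delay)
  	max_delay = delta;	/* worst-case bucket hold time */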

The potential delay grows further with even more elements in a
bucket. Using rcu_read_lock() is a reasonable compromise here:
it may lose some precision, e.g., access stale sockets, but it
will not hurt the performance of bpf programs or user-space
applications that also try to get, delete, or update map elements.
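
The resulting reader-side pattern, condensed from the diff below:

  rcu_read_lock();
  hlist_for_each_entry_rcu(selem, &b->list, map_node) {
  	sk_storage = rcu_dereference(selem->local_storage);
  	/* an element freed by a concurrent writer may still be
  	 * observed here (stale); that is the accepted trade-off
  	 */
  }
  rcu_read_unlock();
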
Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Song Liu <songliubraving@fb.com>
Cc: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20200916224645.720172-1-yhs@fb.com
parent 4153b89b
@@ -674,6 +674,7 @@ struct bpf_iter_seq_sk_storage_map_info {
 static struct bpf_local_storage_elem *
 bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 				 struct bpf_local_storage_elem *prev_selem)
+	__acquires(RCU) __releases(RCU)
 {
 	struct bpf_local_storage *sk_storage;
 	struct bpf_local_storage_elem *selem;
@@ -692,16 +693,16 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 	selem = prev_selem;
 	count = 0;
 	while (selem) {
-		selem = hlist_entry_safe(selem->map_node.next,
+		selem = hlist_entry_safe(rcu_dereference(hlist_next_rcu(&selem->map_node)),
 					 struct bpf_local_storage_elem, map_node);
 		if (!selem) {
 			/* not found, unlock and go to the next bucket */
 			b = &smap->buckets[bucket_id++];
-			raw_spin_unlock_bh(&b->lock);
+			rcu_read_unlock();
 			skip_elems = 0;
 			break;
 		}
-		sk_storage = rcu_dereference_raw(selem->local_storage);
+		sk_storage = rcu_dereference(selem->local_storage);
 		if (sk_storage) {
 			info->skip_elems = skip_elems + count;
 			return selem;
@@ -711,10 +712,10 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 	for (i = bucket_id; i < (1U << smap->bucket_log); i++) {
 		b = &smap->buckets[i];
-		raw_spin_lock_bh(&b->lock);
+		rcu_read_lock();
 		count = 0;
-		hlist_for_each_entry(selem, &b->list, map_node) {
-			sk_storage = rcu_dereference_raw(selem->local_storage);
+		hlist_for_each_entry_rcu(selem, &b->list, map_node) {
+			sk_storage = rcu_dereference(selem->local_storage);
 			if (sk_storage && count >= skip_elems) {
 				info->bucket_id = i;
 				info->skip_elems = count;
@@ -722,7 +723,7 @@ bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info,
 			}
 			count++;
 		}
-		raw_spin_unlock_bh(&b->lock);
+		rcu_read_unlock();
 		skip_elems = 0;
 	}
@@ -781,7 +782,7 @@ static int __bpf_sk_storage_map_seq_show(struct seq_file *seq,
 	ctx.meta = &meta;
 	ctx.map = info->map;
 	if (selem) {
-		sk_storage = rcu_dereference_raw(selem->local_storage);
+		sk_storage = rcu_dereference(selem->local_storage);
 		ctx.sk = sk_storage->owner;
 		ctx.value = SDATA(selem)->data;
 	}
@@ -797,18 +798,12 @@ static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v)
 }

 static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v)
+	__releases(RCU)
 {
-	struct bpf_iter_seq_sk_storage_map_info *info = seq->private;
-	struct bpf_local_storage_map *smap;
-	struct bpf_local_storage_map_bucket *b;
-
-	if (!v) {
+	if (!v)
 		(void)__bpf_sk_storage_map_seq_show(seq, v);
-	} else {
-		smap = (struct bpf_local_storage_map *)info->map;
-		b = &smap->buckets[info->bucket_id];
-		raw_spin_unlock_bh(&b->lock);
-	}
+	else
+		rcu_read_unlock();
 }

 static int bpf_iter_init_sk_storage_map(void *priv_data,