Commit 0e400844 authored by Nhat Pham, committed by Andrew Morton

zswap: track swapins from disk more accurately

Currently, there are a couple of issues with our disk swapin tracking for
dynamic zswap shrinker heuristics:

1. We only increment the swapin counter on pivot pages. This means we
   are not taking into account pages that also need to be swapped in,
   but are already taken care of as part of the readahead window.

2. We are also incrementing when the pages are read from the zswap pool,
   which is inaccurate.

This patch rectifies these issues by incrementing the counter whenever we
need to perform a non-zswap read.  Note that we are slightly overcounting,
as a page might be read into memory by the readahead algorithm even though
it will not be needed by users - however, this is an acceptable
inaccuracy, as the readahead logic itself will adapt to these kinds of
scenarios.

To test this change, I built the kernel under a cgroup with its memory.max
set to 2 GB:

real: 236.66s
user: 4286.06s
sys: 652.86s
swapins: 81552

For comparison, with just the new second chance algorithm, the build time
is as follows:

real: 244.85s
user: 4327.22s
sys: 664.39s
swapins: 94663

Without either:

real: 263.89s
user: 4318.11s
sys: 673.29s
swapins: 227300.5

(average over 5 runs)

With this change, the kernel CPU time reduces by a further 1.7%, and the
real time is reduced by another 3.3%, compared to just the second chance
algorithm by itself.  The swapins count also reduces by another 13.85%.

Combining the two changes, we reduce the real time by 10.32%, kernel CPU
time by 3%, and number of swapins by 64.12%.

To gauge the new scheme's ability to offload cold data, I ran another
benchmark, in which the kernel was built under a cgroup with memory.max
set to 3 GB, but with 0.5 GB worth of cold data allocated before each
build (in a shmem file).

Under the old scheme:

real: 197.18s
user: 4365.08s
sys: 289.02s
zswpwb: 72115.2

Under the new scheme:

real: 195.8s
user: 4362.25s
sys: 290.14s
zswpwb: 87277.8

(average over 5 runs)

Notice that we actually observe a 21% increase in the number of written
back pages - so the new scheme is just as good, if not better at
offloading pages from the zswap pool when they are cold.  Build time
reduces by around 0.7% as a result.

[nphamcs@gmail.com: squeeze a comment into a single line]
  Link: https://lkml.kernel.org/r/20240806004518.3183562-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20240805232243.2896283-3-nphamcs@gmail.com
Fixes: b5ba474f ("zswap: shrink zswap pool based on memory pressure")
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Chengming Zhou <chengming.zhou@linux.dev>
Cc: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Takero Funaki <flintglass@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent e31c38e0
...@@ -521,7 +521,13 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) ...@@ -521,7 +521,13 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
if (zswap_load(folio)) { if (zswap_load(folio)) {
folio_unlock(folio); folio_unlock(folio);
} else if (data_race(sis->flags & SWP_FS_OPS)) { goto finish;
}
/* We have to read from slower devices. Increase zswap protection. */
zswap_folio_swapin(folio);
if (data_race(sis->flags & SWP_FS_OPS)) {
swap_read_folio_fs(folio, plug); swap_read_folio_fs(folio, plug);
} else if (synchronous) { } else if (synchronous) {
swap_read_folio_bdev_sync(folio, sis); swap_read_folio_bdev_sync(folio, sis);
...@@ -529,6 +535,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug) ...@@ -529,6 +535,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
swap_read_folio_bdev_async(folio, sis); swap_read_folio_bdev_async(folio, sis);
} }
finish:
if (workingset) { if (workingset) {
delayacct_thrashing_end(&in_thrashing); delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags); psi_memstall_leave(&pflags);
......
...@@ -702,10 +702,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, ...@@ -702,10 +702,8 @@ struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
/* The page was likely read above, so no need for plugging here */ /* The page was likely read above, so no need for plugging here */
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false); &page_allocated, false);
if (unlikely(page_allocated)) { if (unlikely(page_allocated))
zswap_folio_swapin(folio);
swap_read_folio(folio, NULL); swap_read_folio(folio, NULL);
}
return folio; return folio;
} }
...@@ -854,10 +852,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, ...@@ -854,10 +852,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
/* The folio was likely read above, so no need for plugging here */ /* The folio was likely read above, so no need for plugging here */
folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
&page_allocated, false); &page_allocated, false);
if (unlikely(page_allocated)) { if (unlikely(page_allocated))
zswap_folio_swapin(folio);
swap_read_folio(folio, NULL); swap_read_folio(folio, NULL);
}
return folio; return folio;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment