Commit ddc1a5cb authored by Hugh Dickins, committed by Andrew Morton

mempolicy: alloc_pages_mpol() for NUMA policy without vma

Shrink shmem's stack usage by eliminating the pseudo-vma from its folio
allocation.  alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the
principal actor for passing mempolicy choice down to __alloc_pages(),
rather than vma_alloc_folio(gfp, order, vma, addr, hugepage).

vma_alloc_folio() and alloc_pages() remain, but as wrappers around
alloc_pages_mpol().  alloc_pages_bulk_*() untouched, except to provide the
additional args to policy_nodemask(), which subsumes policy_node(). 
Cleanup throughout, cutting out some unhelpful "helpers".
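
For illustration, the vma wrapper now reduces to roughly the following
shape (a sketch based on the description above, not the exact patch text):

	struct folio *vma_alloc_folio(gfp_t gfp, int order,
			struct vm_area_struct *vma, unsigned long addr,
			bool hugepage)
	{
		struct mempolicy *pol;
		pgoff_t ilx;
		struct page *page;

		/* Resolve the mempolicy and interleave index from the vma */
		pol = get_vma_policy(vma, addr, order, &ilx);
		/* One call now carries the policy choice down to the allocator */
		page = alloc_pages_mpol(gfp, order, pol, ilx, numa_node_id());
		mpol_cond_put(pol);
		return page_rmappable_folio(page);
	}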

It would all be much simpler without MPOL_INTERLEAVE, but that adds a
dynamic to the constant mpol: complicated by v3.6 commit 09c231cb
("tmpfs: distribute interleave better across nodes"), which added ino bias
to the interleave, hidden from mm/mempolicy.c until this commit.

Hence "ilx" throughout, the "interleave index".  Originally I thought it
could be done just with nid, but that's wrong: the nodemask may come from
the shared policy layer below a shmem vma, or it may come from the task
layer above a shmem vma; and without the final nodemask then nodeid cannot
be decided.  And how ilx is applied depends also on page order.
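
The shmem side of this patch shows that dependence on page order: the
shared-policy lookup supplies the inode bias, and the index is scaled by
order before being handed on (quoted from the diff below, reindented here):

	static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
				pgoff_t index, unsigned int order, pgoff_t *ilx)
	{
		struct mempolicy *mpol;

		/* Bias interleave by inode number to distribute better across nodes */
		*ilx = info->vfs_inode.i_ino + (index >> order);

		mpol = mpol_shared_policy_lookup(&info->policy, index);
		return mpol ? mpol : get_task_policy(current);
	}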

The interleave index is almost always irrelevant unless MPOL_INTERLEAVE:
with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX
passed down from vma-less alloc_pages() is also used as a hint not to use
THP-style hugepage allocation - to avoid the overhead of a hugepage arg
(though I don't understand why we never just added a GFP bit for THP - if
it actually needs a different allocation strategy from other pages of the
same order).  vma_alloc_folio() still carries its hugepage arg here, but
it is not used, and should be removed when agreed.
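
For the vma-less path, the wrapper amounts to something like this (again a
sketch: default_policy here stands in for the system default policy):

	struct page *alloc_pages(gfp_t gfp, unsigned int order)
	{
		struct mempolicy *pol = &default_policy;

		if (!in_interrupt() && !(gfp & __GFP_THISNODE))
			pol = get_task_policy(current);

		/* No vma, so no per-vma interleave index, and no THP-style hint */
		return alloc_pages_mpol(gfp, order, pol,
					NO_INTERLEAVE_INDEX, numa_node_id());
	}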

get_vma_policy() no longer allows a NULL vma: over time I believe we've
eradicated all the places which used to need it: e.g. swapoff and madvise
used to pass a NULL vma to read_swap_cache_async(), but now know the vma.
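
Callers which have no vma at all now take a policy directly from the task
instead of passing NULL down: the zswap writeback path in this diff, for
example, does

	mpol = get_task_policy(current);
	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
				NO_INTERLEAVE_INDEX, &page_was_allocated);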

[hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()]
  Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com
Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com
Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: Domenico Cerasuolo <mimmocerasuolo@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 23e48832
@@ -2673,8 +2673,9 @@ static int show_numa_map(struct seq_file *m, void *v)
 	struct numa_maps *md = &numa_priv->md;
 	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
-	struct mempolicy *pol;
 	char buffer[64];
+	struct mempolicy *pol;
+	pgoff_t ilx;
 	int nid;
 
 	if (!mm)
@@ -2683,7 +2684,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 	/* Ensure we start with an empty set of numa_maps statistics. */
 	memset(md, 0, sizeof(*md));
 
-	pol = __get_vma_policy(vma, vma->vm_start);
+	pol = __get_vma_policy(vma, vma->vm_start, &ilx);
 	if (pol) {
 		mpol_to_str(buffer, sizeof(buffer), pol);
 		mpol_cond_put(pol);
......
@@ -8,6 +8,7 @@
 #include <linux/topology.h>
 
 struct vm_area_struct;
+struct mempolicy;
 
 /* Convert GFP flags to their corresponding migrate type */
 #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE)
@@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 
 #ifdef CONFIG_NUMA
 struct page *alloc_pages(gfp_t gfp, unsigned int order);
-struct folio *folio_alloc(gfp_t gfp, unsigned order);
+struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid);
+struct folio *folio_alloc(gfp_t gfp, unsigned int order);
 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 		unsigned long addr, bool hugepage);
 #else
@@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
 	return alloc_pages_node(numa_node_id(), gfp_mask, order);
 }
+static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid)
+{
+	return alloc_pages(gfp, order);
+}
 static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 {
 	return __folio_alloc_node(gfp, order, numa_node_id());
......
@@ -17,6 +17,8 @@
 
 struct mm_struct;
 
+#define NO_INTERLEAVE_INDEX (-1UL)	/* use task il_prev for interleaving */
+
 #ifdef CONFIG_NUMA
 
 /*
@@ -126,7 +128,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
-		unsigned long addr);
+		unsigned long addr, pgoff_t *ilx);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+		unsigned long addr, int order, pgoff_t *ilx);
 bool vma_policy_mof(struct vm_area_struct *vma);
 extern void numa_default_policy(void);
@@ -140,8 +144,6 @@ extern int huge_node(struct vm_area_struct *vma,
 extern bool init_nodemask_of_mempolicy(nodemask_t *mask);
 extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
 				const nodemask_t *mask);
-
-extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
 extern unsigned int mempolicy_slab_node(void);
 
 extern enum zone_type policy_zone;
@@ -179,6 +181,11 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
 
 struct mempolicy {};
 
+static inline struct mempolicy *get_task_policy(struct task_struct *p)
+{
+	return NULL;
+}
+
 static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b)
 {
 	return true;
@@ -213,6 +220,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx)
 	return NULL;
 }
 
+static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+				unsigned long addr, int order, pgoff_t *ilx)
+{
+	*ilx = 0;
+	return NULL;
+}
+
 static inline int
 vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
 {
......
@@ -619,7 +619,7 @@ struct vm_operations_struct {
 	 * policy.
 	 */
 	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
-					unsigned long addr);
+					unsigned long addr, pgoff_t *ilx);
 #endif
 	/*
 	 * Called by vm_normal_page() for special PTEs to find the
......
@@ -562,30 +562,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma)
 }
 
 #ifdef CONFIG_NUMA
-static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
+static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
 {
-	struct file *file = vma->vm_file;
-	struct shm_file_data *sfd = shm_file_data(file);
+	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
 	int err = 0;
 
 	if (sfd->vm_ops->set_policy)
-		err = sfd->vm_ops->set_policy(vma, new);
+		err = sfd->vm_ops->set_policy(vma, mpol);
 	return err;
 }
 
 static struct mempolicy *shm_get_policy(struct vm_area_struct *vma,
-					unsigned long addr)
+					unsigned long addr, pgoff_t *ilx)
 {
-	struct file *file = vma->vm_file;
-	struct shm_file_data *sfd = shm_file_data(file);
-	struct mempolicy *pol = NULL;
+	struct shm_file_data *sfd = shm_file_data(vma->vm_file);
+	struct mempolicy *mpol = vma->vm_policy;
 
 	if (sfd->vm_ops->get_policy)
-		pol = sfd->vm_ops->get_policy(vma, addr);
-	else if (vma->vm_policy)
-		pol = vma->vm_policy;
-
-	return pol;
+		mpol = sfd->vm_ops->get_policy(vma, addr, ilx);
+	return mpol;
 }
 #endif
......
[one file's diff is collapsed and not shown here]
@@ -1544,38 +1544,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 	return NULL;
 }
 #endif /* CONFIG_NUMA && CONFIG_TMPFS */
 
-#ifndef CONFIG_NUMA
-#define vm_policy vm_private_data
-#endif
-
-static void shmem_pseudo_vma_init(struct vm_area_struct *vma,
-		struct shmem_inode_info *info, pgoff_t index)
-{
-	/* Create a pseudo vma that just contains the policy */
-	vma_init(vma, NULL);
-	/* Bias interleave by inode number to distribute better across nodes */
-	vma->vm_pgoff = index + info->vfs_inode.i_ino;
-	vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index);
-}
-
-static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
-{
-	/* Drop reference taken by mpol_shared_policy_lookup() */
-	mpol_cond_put(vma->vm_policy);
-}
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+			pgoff_t index, unsigned int order, pgoff_t *ilx);
 
-static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp,
 		struct shmem_inode_info *info, pgoff_t index)
 {
-	struct vm_area_struct pvma;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
 	struct page *page;
-	struct vm_fault vmf = {
-		.vma = &pvma,
-	};
 
-	shmem_pseudo_vma_init(&pvma, info, index);
-	page = swap_cluster_readahead(swap, gfp, &vmf);
-	shmem_pseudo_vma_destroy(&pvma);
+	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+	page = swap_cluster_readahead(swap, gfp, mpol, ilx);
+	mpol_cond_put(mpol);
 
 	if (!page)
 		return NULL;
@@ -1609,27 +1591,29 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
 		struct shmem_inode_info *info, pgoff_t index)
 {
-	struct vm_area_struct pvma;
-	struct folio *folio;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
 
-	shmem_pseudo_vma_init(&pvma, info, index);
-	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
-	shmem_pseudo_vma_destroy(&pvma);
+	mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx);
+	page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id());
+	mpol_cond_put(mpol);
 
-	return folio;
+	return page_rmappable_folio(page);
 }
 
 static struct folio *shmem_alloc_folio(gfp_t gfp,
 		struct shmem_inode_info *info, pgoff_t index)
 {
-	struct vm_area_struct pvma;
-	struct folio *folio;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
 
-	shmem_pseudo_vma_init(&pvma, info, index);
-	folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
-	shmem_pseudo_vma_destroy(&pvma);
+	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
+	page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id());
+	mpol_cond_put(mpol);
 
-	return folio;
+	return (struct folio *)page;
 }
 
 static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
@@ -1883,7 +1867,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 			count_memcg_event_mm(fault_mm, PGMAJFAULT);
 		}
 		/* Here we actually start the io */
-		folio = shmem_swapin(swap, gfp, info, index);
+		folio = shmem_swapin_cluster(swap, gfp, info, index);
 		if (!folio) {
 			error = -ENOMEM;
 			goto failed;
@@ -2334,15 +2318,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
 }
 
 static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
-					  unsigned long addr)
+					  unsigned long addr, pgoff_t *ilx)
 {
 	struct inode *inode = file_inode(vma->vm_file);
 	pgoff_t index;
 
+	/*
+	 * Bias interleave by inode number to distribute better across nodes;
+	 * but this interface is independent of which page order is used, so
+	 * supplies only that bias, letting caller apply the offset (adjusted
+	 * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()).
+	 */
+	*ilx = inode->i_ino;
 	index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 	return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
 }
-#endif
+
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+			pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+	struct mempolicy *mpol;
+
+	/* Bias interleave by inode number to distribute better across nodes */
+	*ilx = info->vfs_inode.i_ino + (index >> order);
+
+	mpol = mpol_shared_policy_lookup(&info->policy, index);
+	return mpol ? mpol : get_task_policy(current);
+}
+#else
+static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info,
+			pgoff_t index, unsigned int order, pgoff_t *ilx)
+{
+	*ilx = 0;
+	return NULL;
+}
+#endif /* CONFIG_NUMA */
 
 int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
 {
......
@@ -2,6 +2,8 @@
 #ifndef _MM_SWAP_H
 #define _MM_SWAP_H
 
+struct mempolicy;
+
 #ifdef CONFIG_SWAP
 #include <linux/blk_types.h> /* for bio_end_io_t */
@@ -48,11 +50,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   unsigned long addr,
 				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-				     struct vm_area_struct *vma,
-				     unsigned long addr,
+				     struct mempolicy *mpol, pgoff_t ilx,
 				     bool *new_page_allocated);
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
-				    struct vm_fault *vmf);
+				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
 			      struct vm_fault *vmf);
@@ -80,7 +81,7 @@ static inline void show_swap_cache_info(void)
 }
 
 static inline struct page *swap_cluster_readahead(swp_entry_t entry,
-			gfp_t gfp_mask, struct vm_fault *vmf)
+			gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx)
 {
 	return NULL;
 }
......
@@ -10,6 +10,7 @@
 #include <linux/mm.h>
 #include <linux/gfp.h>
 #include <linux/kernel_stat.h>
+#include <linux/mempolicy.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/init.h>
@@ -410,8 +411,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
 }
 
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-				     struct vm_area_struct *vma, unsigned long addr,
+				     struct mempolicy *mpol, pgoff_t ilx,
 				     bool *new_page_allocated)
 {
 	struct swap_info_struct *si;
 	struct folio *folio;
@@ -453,7 +454,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
 		 * cause any racers to loop around until we add it to cache.
 		 */
-		folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false);
+		folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0,
+						mpol, ilx, numa_node_id());
 		if (!folio)
 			goto fail_put_swap;
@@ -528,14 +530,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   struct vm_area_struct *vma,
 				   unsigned long addr, struct swap_iocb **plug)
 {
-	bool page_was_allocated;
-	struct page *retpage = __read_swap_cache_async(entry, gfp_mask,
-			vma, addr, &page_was_allocated);
+	bool page_allocated;
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
 
-	if (page_was_allocated)
-		swap_readpage(retpage, false, plug);
+	mpol = get_vma_policy(vma, addr, 0, &ilx);
+	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+					&page_allocated);
+	mpol_cond_put(mpol);
 
-	return retpage;
+	if (page_allocated)
+		swap_readpage(page, false, plug);
+	return page;
 }
 
 static unsigned int __swapin_nr_pages(unsigned long prev_offset,
@@ -603,7 +610,8 @@ static unsigned long swapin_nr_pages(unsigned long offset)
  * swap_cluster_readahead - swap in pages in hope we need them soon
  * @entry: swap entry of this memory
  * @gfp_mask: memory allocation flags
- * @vmf: fault information
+ * @mpol: NUMA memory allocation policy to be applied
+ * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  *
  * Returns the struct page for entry and addr, after queueing swapin.
 *
@@ -612,13 +620,12 @@ static unsigned long swapin_nr_pages(unsigned long offset)
  * because it doesn't cost us any seek time. We also make sure to queue
  * the 'original' request together with the readahead ones...
  *
- * This has been extended to use the NUMA policies from the mm triggering
- * the readahead.
- *
- * Caller must hold read mmap_lock if vmf->vma is not NULL.
+ * Note: it is intentional that the same NUMA policy and interleave index
+ * are used for every page of the readahead: neighbouring pages on swap
+ * are fairly likely to have been swapped out from the same node.
  */
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
-				struct vm_fault *vmf)
+				struct mempolicy *mpol, pgoff_t ilx)
 {
 	struct page *page;
 	unsigned long entry_offset = swp_offset(entry);
@@ -629,8 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
 	bool page_allocated;
-	struct vm_area_struct *vma = vmf->vma;
-	unsigned long addr = vmf->address;
 
 	mask = swapin_nr_pages(offset) - 1;
 	if (!mask)
@@ -648,8 +653,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	for (offset = start_offset; offset <= end_offset ; offset++) {
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
 				swp_entry(swp_type(entry), offset),
-				gfp_mask, vma, addr, &page_allocated);
+				gfp_mask, mpol, ilx, &page_allocated);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -663,11 +668,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 	}
 	blk_finish_plug(&plug);
 	swap_read_unplug(splug);
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
 skip:
 	/* The page was likely read above, so no need for plugging here */
-	return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL);
+	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+					&page_allocated);
+	if (unlikely(page_allocated))
+		swap_readpage(page, false, NULL);
+	return page;
 }
 
 int init_swap_address_space(unsigned int type, unsigned long nr_pages)
@@ -765,8 +773,10 @@ static void swap_ra_info(struct vm_fault *vmf,
 /**
  * swap_vma_readahead - swap in pages in hope we need them soon
- * @fentry: swap entry of this memory
+ * @targ_entry: swap entry of the targeted memory
  * @gfp_mask: memory allocation flags
+ * @mpol: NUMA memory allocation policy to be applied
+ * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE
  * @vmf: fault information
  *
  * Returns the struct page for entry and addr, after queueing swapin.
@@ -777,16 +787,17 @@ static void swap_ra_info(struct vm_fault *vmf,
  * Caller must hold read mmap_lock if vmf->vma is not NULL.
  *
  */
-static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
+static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
+				       struct mempolicy *mpol, pgoff_t targ_ilx,
 				       struct vm_fault *vmf)
 {
 	struct blk_plug plug;
 	struct swap_iocb *splug = NULL;
-	struct vm_area_struct *vma = vmf->vma;
 	struct page *page;
 	pte_t *pte = NULL, pentry;
 	unsigned long addr;
 	swp_entry_t entry;
+	pgoff_t ilx;
 	unsigned int i;
 	bool page_allocated;
 	struct vma_swap_readahead ra_info = {
@@ -798,9 +809,10 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 		goto skip;
 
 	addr = vmf->address - (ra_info.offset * PAGE_SIZE);
+	ilx = targ_ilx - ra_info.offset;
 
 	blk_start_plug(&plug);
-	for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) {
+	for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) {
 		if (!pte++) {
 			pte = pte_offset_map(vmf->pmd, addr);
 			if (!pte)
@@ -814,8 +826,8 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 			continue;
 		pte_unmap(pte);
 		pte = NULL;
-		page = __read_swap_cache_async(entry, gfp_mask, vma,
-					       addr, &page_allocated);
+		page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
+					       &page_allocated);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -834,8 +846,11 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 	lru_add_drain();
 skip:
 	/* The page was likely read above, so no need for plugging here */
-	return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address,
-				     NULL);
+	page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
+				       &page_allocated);
+	if (unlikely(page_allocated))
+		swap_readpage(page, false, NULL);
+	return page;
 }
 
 /**
@@ -853,9 +868,16 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 				struct vm_fault *vmf)
 {
-	return swap_use_vma_readahead() ?
-			swap_vma_readahead(entry, gfp_mask, vmf) :
-			swap_cluster_readahead(entry, gfp_mask, vmf);
+	struct mempolicy *mpol;
+	pgoff_t ilx;
+	struct page *page;
+
+	mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx);
+	page = swap_use_vma_readahead() ?
+		swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) :
+		swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
+	mpol_cond_put(mpol);
+
+	return page;
 }
 
 #ifdef CONFIG_SYSFS
......
@@ -24,6 +24,7 @@
 #include <linux/swap.h>
 #include <linux/crypto.h>
 #include <linux/scatterlist.h>
+#include <linux/mempolicy.h>
 #include <linux/mempool.h>
 #include <linux/zpool.h>
 #include <crypto/acompress.h>
@@ -1057,6 +1058,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 {
 	swp_entry_t swpentry = entry->swpentry;
 	struct page *page;
+	struct mempolicy *mpol;
 	struct scatterlist input, output;
 	struct crypto_acomp_ctx *acomp_ctx;
 	struct zpool *pool = zswap_find_zpool(entry);
@@ -1075,8 +1077,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
 	}
 
 	/* try to allocate swap cache page */
-	page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0,
-				       &page_was_allocated);
+	mpol = get_task_policy(current);
+	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol,
+				NO_INTERLEAVE_INDEX, &page_was_allocated);
 	if (!page) {
 		ret = -ENOMEM;
 		goto fail;
......