Commit 89c83fb5 authored by Michal Hocko, committed by Linus Torvalds

mm, thp: consolidate THP gfp handling into alloc_hugepage_direct_gfpmask

THP allocation mode is quite complex and it depends on the defrag
mode. Much of this complexity is currently hidden in
alloc_hugepage_direct_gfpmask. The NUMA special casing (namely
__GFP_THISNODE), however, is independent and currently lives in
alloc_pages_vma. This both adds an unnecessary branch to all vma-based
page allocation requests and makes the code needlessly more complex.
Not to mention that, until recently, shmem THP used to do node reclaim
unconditionally regardless of the defrag mode. That was not only
unexpected behavior but also hardly a good default, and I strongly
suspect it was a side effect of the code sharing rather than a
deliberate decision, which suggests that such a layering is wrong.

Get rid of the THP special casing in alloc_pages_vma and move the
logic to alloc_hugepage_direct_gfpmask. To preserve the current
behavior, __GFP_THISNODE is applied to the resulting gfp mask only
when direct reclaim is not requested and there is no explicit NUMA
binding.

Please note that there is also a slight behavioral difference wrt
MPOL_BIND now. The previous code would avoid using __GFP_THISNODE if
the local node was outside of policy_nodemask(). After this patch
__GFP_THISNODE is avoided for all MPOL_BIND policies. In other words,
if the local node is actually allowed by the bind policy's nodemask,
__GFP_THISNODE used to be added, but now it is not. From the behavior
POV this is still correct because the policy nodemask is used.
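
For illustration, the decision being centralized here can be modelled
in a few lines of plain C. This is a simplified sketch, not kernel
code: the MODEL_* flags, the model_policy enum and thp_gfp_model() are
made-up stand-ins, and the authoritative logic is the
alloc_hugepage_direct_gfpmask() hunk in the diff below.

/* thp_thisnode_model.c -- illustrative userspace model, not kernel code. */
#include <stdio.h>

#define MODEL_DIRECT_RECLAIM 0x1u /* stand-in for __GFP_DIRECT_RECLAIM */
#define MODEL_THISNODE       0x2u /* stand-in for __GFP_THISNODE */

enum model_policy { MODEL_POL_DEFAULT, MODEL_POL_BIND };

/*
 * Mirrors the rule described above: prefer the local node (THISNODE)
 * only for optimistic attempts, i.e. when direct reclaim is not
 * requested and no explicit NUMA binding (MPOL_BIND) is in place.
 */
static unsigned int thp_gfp_model(unsigned int defrag_gfp, enum model_policy pol)
{
	unsigned int this_node = 0;

	if (pol != MODEL_POL_BIND)
		this_node = MODEL_THISNODE;
	if (defrag_gfp & MODEL_DIRECT_RECLAIM)
		return defrag_gfp;		/* fall back to other nodes */
	return defrag_gfp | this_node;		/* stay local opportunistically */
}

int main(void)
{
	printf("%d\n", !!(thp_gfp_model(0, MODEL_POL_DEFAULT) & MODEL_THISNODE));	/* 1 */
	printf("%d\n", !!(thp_gfp_model(0, MODEL_POL_BIND) & MODEL_THISNODE));		/* 0 */
	printf("%d\n", !!(thp_gfp_model(MODEL_DIRECT_RECLAIM, MODEL_POL_DEFAULT)
			  & MODEL_THISNODE));						/* 0 */
	return 0;
}

Compiled and run, this prints 1, 0, 0: the local-node preference
survives only the optimistic case (no direct reclaim, no MPOL_BIND).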

Link: http://lkml.kernel.org/r/20180925120326.24392-3-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Alex Williamson <alex.williamson@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Rientjes <rientjes@google.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Stefan Priebe - Profihost AG <s.priebe@profihost.ag>
Cc: Zi Yan <zi.yan@cs.rutgers.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 6194ae42
@@ -510,22 +510,18 @@ alloc_pages(gfp_t gfp_mask, unsigned int order)
 }
 extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
 			struct vm_area_struct *vma, unsigned long addr,
-			int node, bool hugepage);
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
-	alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
+			int node);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
-	alloc_pages(gfp_mask, order)
-#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
+#define alloc_pages_vma(gfp_mask, order, vma, addr, node)\
 	alloc_pages(gfp_mask, order)
 #endif
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 #define alloc_page_vma(gfp_mask, vma, addr)			\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id())
 #define alloc_page_vma_node(gfp_mask, vma, addr, node)		\
-	alloc_pages_vma(gfp_mask, 0, vma, addr, node, false)
+	alloc_pages_vma(gfp_mask, 0, vma, addr, node)
 
 extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
...
@@ -139,6 +139,8 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp,
 struct mempolicy *get_task_policy(struct task_struct *p);
 struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
 		unsigned long addr);
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+		unsigned long addr);
 bool vma_policy_mof(struct vm_area_struct *vma);
 
 extern void numa_default_policy(void);
...
@@ -629,21 +629,40 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
  *	    available
  * never: never stall for any thp allocation
  */
-static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma)
+static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma, unsigned long addr)
 {
 	const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+	gfp_t this_node = 0;
+
+#ifdef CONFIG_NUMA
+	struct mempolicy *pol;
+	/*
+	 * __GFP_THISNODE is used only when __GFP_DIRECT_RECLAIM is not
+	 * specified, to express a general desire to stay on the current
+	 * node for optimistic allocation attempts. If the defrag mode
+	 * and/or madvise hint requires the direct reclaim then we prefer
+	 * to fallback to other node rather than node reclaim because that
+	 * can lead to excessive reclaim even though there is free memory
+	 * on other nodes. We expect that NUMA preferences are specified
+	 * by memory policies.
+	 */
+	pol = get_vma_policy(vma, addr);
+	if (pol->mode != MPOL_BIND)
+		this_node = __GFP_THISNODE;
+	mpol_cond_put(pol);
+#endif
 
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
-		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+		return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM | this_node;
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     __GFP_KSWAPD_RECLAIM);
+							     __GFP_KSWAPD_RECLAIM | this_node);
 	if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
 		return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
-							     0);
-	return GFP_TRANSHUGE_LIGHT;
+							     this_node);
+	return GFP_TRANSHUGE_LIGHT | this_node;
 }
 
 /* Caller must hold page table lock. */
@@ -715,8 +734,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		pte_free(vma->vm_mm, pgtable);
 		return ret;
 	}
-	gfp = alloc_hugepage_direct_gfpmask(vma);
-	page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+	gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+	page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, vma, haddr, numa_node_id());
 	if (unlikely(!page)) {
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -1286,8 +1305,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 alloc:
 	if (transparent_hugepage_enabled(vma) &&
 	    !transparent_hugepage_debug_cow()) {
-		huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-		new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
+		huge_gfp = alloc_hugepage_direct_gfpmask(vma, haddr);
+		new_page = alloc_pages_vma(huge_gfp, HPAGE_PMD_ORDER, vma,
+					   haddr, numa_node_id());
 	} else
 		new_page = NULL;
...
@@ -1116,8 +1116,8 @@ static struct page *new_page(struct page *page, unsigned long start)
 	} else if (PageTransHuge(page)) {
 		struct page *thp;
 
-		thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address,
-					 HPAGE_PMD_ORDER);
+		thp = alloc_pages_vma(GFP_TRANSHUGE, HPAGE_PMD_ORDER, vma,
+				      address, numa_node_id());
 		if (!thp)
 			return NULL;
 		prep_transhuge_page(thp);
@@ -1662,7 +1662,7 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
  * freeing by another task. It is the caller's responsibility to free the
  * extra reference for shared policies.
  */
-static struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
+struct mempolicy *get_vma_policy(struct vm_area_struct *vma,
 						unsigned long addr)
 {
 	struct mempolicy *pol = __get_vma_policy(vma, addr);
@@ -2011,7 +2011,6 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  *	@vma:  Pointer to VMA or NULL if not available.
  *	@addr: Virtual Address of the allocation. Must be inside the VMA.
  *	@node: Which node to prefer for allocation (modulo policy).
- *	@hugepage: for hugepages try only the preferred node if possible
  *
  *	This function allocates a page from the kernel page pool and applies
  *	a NUMA policy associated with the VMA or the current process.
@@ -2022,7 +2021,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  */
 struct page *
 alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
-		unsigned long addr, int node, bool hugepage)
+		unsigned long addr, int node)
 {
 	struct mempolicy *pol;
 	struct page *page;
@@ -2040,60 +2039,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 		goto out;
 	}
 
-	if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
-		int hpage_node = node;
-
-		/*
-		 * For hugepage allocation and non-interleave policy which
-		 * allows the current node (or other explicitly preferred
-		 * node) we only try to allocate from the current/preferred
-		 * node and don't fall back to other nodes, as the cost of
-		 * remote accesses would likely offset THP benefits.
-		 *
-		 * If the policy is interleave, or does not allow the current
-		 * node in its nodemask, we allocate the standard way.
-		 */
-		if (pol->mode == MPOL_PREFERRED &&
-						!(pol->flags & MPOL_F_LOCAL))
-			hpage_node = pol->v.preferred_node;
-
-		nmask = policy_nodemask(gfp, pol);
-		if (!nmask || node_isset(hpage_node, *nmask)) {
-			mpol_cond_put(pol);
-			/*
-			 * We cannot invoke reclaim if __GFP_THISNODE
-			 * is set. Invoking reclaim with
-			 * __GFP_THISNODE set, would cause THP
-			 * allocations to trigger heavy swapping
-			 * despite there may be tons of free memory
-			 * (including potentially plenty of THP
-			 * already available in the buddy) on all the
-			 * other NUMA nodes.
-			 *
-			 * At most we could invoke compaction when
-			 * __GFP_THISNODE is set (but we would need to
-			 * refrain from invoking reclaim even if
-			 * compaction returned COMPACT_SKIPPED because
-			 * there wasn't not enough memory to succeed
-			 * compaction). For now just avoid
-			 * __GFP_THISNODE instead of limiting the
-			 * allocation path to a strict and single
-			 * compaction invocation.
-			 *
-			 * Supposedly if direct reclaim was enabled by
-			 * the caller, the app prefers THP regardless
-			 * of the node it comes from so this would be
-			 * more desiderable behavior than only
-			 * providing THP originated from the local
-			 * node in such case.
-			 */
-			if (!(gfp & __GFP_DIRECT_RECLAIM))
-				gfp |= __GFP_THISNODE;
-
-			page = __alloc_pages_node(hpage_node, gfp, order);
-			goto out;
-		}
-	}
-
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
...
@@ -1435,7 +1435,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
 
 	shmem_pseudo_vma_init(&pvma, info, hindex);
 	page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
-			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+			HPAGE_PMD_ORDER, &pvma, 0, numa_node_id());
 	shmem_pseudo_vma_destroy(&pvma);
 	if (page)
 		prep_transhuge_page(page);
...