Commit 5e911373 authored by Mike Kravetz, committed by Linus Torvalds

mm/hugetlb: add cache of descriptors to resv_map for region_add

hugetlbfs is used today by applications that want a high degree of
control over huge page usage.  Often, large hugetlbfs files are used to
map a large number of huge pages into the application processes.  The
applications know when page ranges within these large files will no
longer be used, and ideally would like to release them back to the
subpool or global pools for other uses.  The fallocate() system call
provides an interface for preallocation and hole punching within files.
This patch set adds fallocate functionality to hugetlbfs.

fallocate hole punch will want to remove a specific range of pages.
When pages are removed, their associated entries in the region/reserve
map will also be removed.  This will break an assumption in the
region_chg/region_add calling sequence.  If a new region descriptor must
be allocated, it is done as part of the region_chg processing.  In this
way, region_add can not fail because it does not need to attempt an
allocation.
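
The assumption amounts to the two-phase calling pattern sketched below.
This is an illustrative caller only, not code from this patch: the local
variables and the surrounding error handling are placeholders, while
region_chg() and region_add() are the real routines.

        chg = region_chg(resv, from, to);   /* phase 1: may kmalloc and fail */
        if (chg < 0)
                return chg;                 /* -ENOMEM, nothing to undo */

        /* ... charge the subpool/global pools for 'chg' huge pages ... */

        add = region_add(resv, from, to);   /* phase 2: cannot fail, the */
                                            /* needed descriptor exists  */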

To prepare for fallocate hole punch, create a "cache" of descriptors
that can be used by region_add if necessary.  region_chg will ensure
there are sufficient entries in the cache.  It will be necessary to
track the number of in progress add operations to know that a sufficient
number of descriptors reside in the cache.  A new routine region_abort
is added to adjust this in progress count when add operations are
aborted.  vma_abort_reservation is also added for callers creating
reservations with vma_needs_reservation/vma_commit_reservation.
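
With the cache and the in progress count in place, the same illustrative
caller gains an abort path.  Again a sketch, not code from this patch:
charge_pools() is a hypothetical stand-in for the subpool/cgroup accounting
a real caller performs between the two phases.

        chg = region_chg(resv, from, to);   /* bumps adds_in_progress and     */
        if (chg < 0)                        /* guarantees a cached descriptor */
                return chg;

        if (charge_pools(chg) < 0) {            /* hypothetical accounting failure */
                region_abort(resv, from, to);   /* only drops adds_in_progress     */
                return -ENOSPC;
        }

        add = region_add(resv, from, to);   /* cannot fail: falls back to the  */
                                            /* cached descriptor if region_del */
                                            /* raced and removed the region    */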

[akpm@linux-foundation.org: fix typo in comment, use more cols]
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Aneesh Kumar <aneesh.kumar@linux.vnet.ibm.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent bb14c2c7
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -35,6 +35,9 @@ struct resv_map {
         struct kref refs;
         spinlock_t lock;
         struct list_head regions;
+        long adds_in_progress;
+        struct list_head region_cache;
+        long region_cache_count;
 };
 extern struct resv_map *resv_map_alloc(void);
 void resv_map_release(struct kref *ref);
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -240,11 +240,14 @@ struct file_region {
 
 /*
  * Add the huge page range represented by [f, t) to the reserve
- * map.  Existing regions will be expanded to accommodate the
- * specified range.  We know only existing regions need to be
- * expanded, because region_add is only called after region_chg
- * with the same range.  If a new file_region structure must
- * be allocated, it is done in region_chg.
+ * map.  In the normal case, existing regions will be expanded
+ * to accommodate the specified range.  Sufficient regions should
+ * exist for expansion due to the previous call to region_chg
+ * with the same range.  However, it is possible that region_del
+ * could have been called after region_chg and modified the map
+ * in such a way that no region exists to be expanded.  In this
+ * case, pull a region descriptor from the cache associated with
+ * the map and use that for the new range.
  *
  * Return the number of new huge pages added to the map.  This
  * number is greater than or equal to zero.
@@ -261,6 +264,28 @@ static long region_add(struct resv_map *resv, long f, long t)
                 if (f <= rg->to)
                         break;
 
+        /*
+         * If no region exists which can be expanded to include the
+         * specified range, the list must have been modified by an
+         * interleaving call to region_del().  Pull a region descriptor
+         * from the cache and use it for this range.
+         */
+        if (&rg->link == head || t < rg->from) {
+                VM_BUG_ON(resv->region_cache_count <= 0);
+                resv->region_cache_count--;
+                nrg = list_first_entry(&resv->region_cache, struct file_region,
+                                        link);
+                list_del(&nrg->link);
+
+                nrg->from = f;
+                nrg->to = t;
+                list_add(&nrg->link, rg->link.prev);
+
+                add += t - f;
+                goto out_locked;
+        }
+
         /* Round our left edge to the current segment if it encloses us. */
         if (f > rg->from)
                 f = rg->from;
@@ -294,6 +319,8 @@ static long region_add(struct resv_map *resv, long f, long t)
         add += t - nrg->to;             /* Added to end of region */
         nrg->to = t;
 
+out_locked:
+        resv->adds_in_progress--;
         spin_unlock(&resv->lock);
         VM_BUG_ON(add < 0);
         return add;
@@ -312,11 +339,14 @@ static long region_add(struct resv_map *resv, long f, long t)
  * so that the subsequent region_add call will have all the
  * regions it needs and will not fail.
  *
- * Returns the number of huge pages that need to be added
- * to the existing reservation map for the range [f, t).
- * This number is greater or equal to zero.  -ENOMEM is
- * returned if a new file_region structure is needed and can
- * not be allocated.
+ * Upon entry, region_chg will also examine the cache of region descriptors
+ * associated with the map.  If there are not enough descriptors cached, one
+ * will be allocated for the in progress add operation.
+ *
+ * Returns the number of huge pages that need to be added to the existing
+ * reservation map for the range [f, t).  This number is greater or equal to
+ * zero.  -ENOMEM is returned if a new file_region structure or cache entry
+ * is needed and can not be allocated.
  */
 static long region_chg(struct resv_map *resv, long f, long t)
 {
@@ -326,6 +356,31 @@ static long region_chg(struct resv_map *resv, long f, long t)
 retry:
         spin_lock(&resv->lock);
+retry_locked:
+        resv->adds_in_progress++;
+
+        /*
+         * Check for sufficient descriptors in the cache to accommodate
+         * the number of in progress add operations.
+         */
+        if (resv->adds_in_progress > resv->region_cache_count) {
+                struct file_region *trg;
+
+                VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
+                /* Must drop lock to allocate a new descriptor. */
+                resv->adds_in_progress--;
+                spin_unlock(&resv->lock);
+
+                trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+                if (!trg)
+                        return -ENOMEM;
+
+                spin_lock(&resv->lock);
+                list_add(&trg->link, &resv->region_cache);
+                resv->region_cache_count++;
+                goto retry_locked;
+        }
+
         /* Locate the region we are before or in. */
         list_for_each_entry(rg, head, link)
                 if (f <= rg->to)
@@ -336,6 +391,7 @@ static long region_chg(struct resv_map *resv, long f, long t)
          * size such that we can guarantee to record the reservation. */
         if (&rg->link == head || t < rg->from) {
                 if (!nrg) {
+                        resv->adds_in_progress--;
                         spin_unlock(&resv->lock);
                         nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
                         if (!nrg)
@@ -384,6 +440,25 @@ static long region_chg(struct resv_map *resv, long f, long t)
         return chg;
 }
 
+/*
+ * Abort the in progress add operation.  The adds_in_progress field
+ * of the resv_map keeps track of the operations in progress between
+ * calls to region_chg and region_add.  Operations are sometimes
+ * aborted after the call to region_chg.  In such cases, region_abort
+ * is called to decrement the adds_in_progress counter.
+ *
+ * NOTE: The range arguments [f, t) are not needed or used in this
+ * routine.  They are kept to make reading the calling code easier as
+ * arguments will match the associated region_chg call.
+ */
+static void region_abort(struct resv_map *resv, long f, long t)
+{
+        spin_lock(&resv->lock);
+        VM_BUG_ON(!resv->region_cache_count);
+        resv->adds_in_progress--;
+        spin_unlock(&resv->lock);
+}
+
 /*
  * Truncate the reserve map at index 'end'.  Modify/truncate any
  * region which contains end.  Delete any regions past end.
@@ -544,22 +619,44 @@ static void set_vma_private_data(struct vm_area_struct *vma,
 struct resv_map *resv_map_alloc(void)
 {
         struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
-        if (!resv_map)
+        struct file_region *rg = kmalloc(sizeof(*rg), GFP_KERNEL);
+
+        if (!resv_map || !rg) {
+                kfree(resv_map);
+                kfree(rg);
                 return NULL;
+        }
 
         kref_init(&resv_map->refs);
         spin_lock_init(&resv_map->lock);
         INIT_LIST_HEAD(&resv_map->regions);
 
+        resv_map->adds_in_progress = 0;
+
+        INIT_LIST_HEAD(&resv_map->region_cache);
+        list_add(&rg->link, &resv_map->region_cache);
+        resv_map->region_cache_count = 1;
+
         return resv_map;
 }
 
 void resv_map_release(struct kref *ref)
 {
         struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
+        struct list_head *head = &resv_map->region_cache;
+        struct file_region *rg, *trg;
 
         /* Clear out any active regions before we release the map. */
         region_truncate(resv_map, 0);
 
+        /* ... and any entries left in the cache */
+        list_for_each_entry_safe(rg, trg, head, link) {
+                list_del(&rg->link);
+                kfree(rg);
+        }
+
+        VM_BUG_ON(resv_map->adds_in_progress);
+
         kfree(resv_map);
 }
@@ -1473,16 +1570,18 @@ static void return_unused_surplus_pages(struct hstate *h,
         }
 }
 
 /*
- * vma_needs_reservation and vma_commit_reservation are used by the huge
- * page allocation routines to manage reservations.
+ * vma_needs_reservation, vma_commit_reservation and vma_abort_reservation
+ * are used by the huge page allocation routines to manage reservations.
  *
  * vma_needs_reservation is called to determine if the huge page at addr
  * within the vma has an associated reservation.  If a reservation is
  * needed, the value 1 is returned.  The caller is then responsible for
  * managing the global reservation and subpool usage counts.  After
  * the huge page has been allocated, vma_commit_reservation is called
- * to add the page to the reservation map.
+ * to add the page to the reservation map.  If the reservation must be
+ * aborted instead of committed, vma_abort_reservation is called.
  *
  * In the normal case, vma_commit_reservation returns the same value
  * as the preceding vma_needs_reservation call.  The only time this
@@ -1490,9 +1589,14 @@ static void return_unused_surplus_pages(struct hstate *h,
  * is the responsibility of the caller to notice the difference and
  * take appropriate action.
  */
+enum vma_resv_mode {
+        VMA_NEEDS_RESV,
+        VMA_COMMIT_RESV,
+        VMA_ABORT_RESV,
+};
 static long __vma_reservation_common(struct hstate *h,
                                 struct vm_area_struct *vma, unsigned long addr,
-                                bool commit)
+                                enum vma_resv_mode mode)
 {
         struct resv_map *resv;
         pgoff_t idx;
@@ -1503,10 +1607,20 @@ static long __vma_reservation_common(struct hstate *h,
                 return 1;
 
         idx = vma_hugecache_offset(h, vma, addr);
-        if (commit)
-                ret = region_add(resv, idx, idx + 1);
-        else
+        switch (mode) {
+        case VMA_NEEDS_RESV:
                 ret = region_chg(resv, idx, idx + 1);
+                break;
+        case VMA_COMMIT_RESV:
+                ret = region_add(resv, idx, idx + 1);
+                break;
+        case VMA_ABORT_RESV:
+                region_abort(resv, idx, idx + 1);
+                ret = 0;
+                break;
+        default:
+                BUG();
+        }
 
         if (vma->vm_flags & VM_MAYSHARE)
                 return ret;
@@ -1517,13 +1631,19 @@ static long __vma_reservation_common(struct hstate *h,
 static long vma_needs_reservation(struct hstate *h,
                         struct vm_area_struct *vma, unsigned long addr)
 {
-        return __vma_reservation_common(h, vma, addr, false);
+        return __vma_reservation_common(h, vma, addr, VMA_NEEDS_RESV);
 }
 
 static long vma_commit_reservation(struct hstate *h,
                         struct vm_area_struct *vma, unsigned long addr)
 {
-        return __vma_reservation_common(h, vma, addr, true);
+        return __vma_reservation_common(h, vma, addr, VMA_COMMIT_RESV);
+}
+
+static void vma_abort_reservation(struct hstate *h,
+                        struct vm_area_struct *vma, unsigned long addr)
+{
+        (void)__vma_reservation_common(h, vma, addr, VMA_ABORT_RESV);
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
@@ -1549,8 +1669,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         if (chg < 0)
                 return ERR_PTR(-ENOMEM);
         if (chg || avoid_reserve)
-                if (hugepage_subpool_get_pages(spool, 1) < 0)
+                if (hugepage_subpool_get_pages(spool, 1) < 0) {
+                        vma_abort_reservation(h, vma, addr);
                         return ERR_PTR(-ENOSPC);
+                }
 
         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
         if (ret)
@@ -1596,6 +1718,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 out_subpool_put:
         if (chg || avoid_reserve)
                 hugepage_subpool_put_pages(spool, 1);
+        vma_abort_reservation(h, vma, addr);
         return ERR_PTR(-ENOSPC);
 }
@@ -3236,11 +3359,14 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
          * any allocations necessary to record that reservation occur outside
          * the spinlock.
          */
-        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED))
+        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                 if (vma_needs_reservation(h, vma, address) < 0) {
                         ret = VM_FAULT_OOM;
                         goto backout_unlocked;
                 }
+                /* Just decrements count, does not deallocate */
+                vma_abort_reservation(h, vma, address);
+        }
 
         ptl = huge_pte_lockptr(h, mm, ptep);
         spin_lock(ptl);
@@ -3387,6 +3513,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         ret = VM_FAULT_OOM;
                         goto out_mutex;
                 }
+                /* Just decrements count, does not deallocate */
+                vma_abort_reservation(h, vma, address);
 
                 if (!(vma->vm_flags & VM_MAYSHARE))
                         pagecache_page = hugetlbfs_pagecache_page(h,
@@ -3726,6 +3854,8 @@ int hugetlb_reserve_pages(struct inode *inode,
         }
         return 0;
 out_err:
+        if (!vma || vma->vm_flags & VM_MAYSHARE)
+                region_abort(resv_map, from, to);
         if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                 kref_put(&resv_map->refs, resv_map_release);
         return ret;