mm: Use multi-index entries in the page cache

We currently store large folios as 2^N consecutive entries.  While this
consumes rather more memory than necessary, it also turns out to be buggy.
A writeback operation which starts within a tail page of a dirty folio will
not write back the folio as the xarray's dirty bit is only set on the
head index.  With multi-index entries, the dirty bit will be found no
matter where in the folio the operation starts.

This does end up simplifying the page cache slightly, although not as
much as I had hoped.
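
For illustration only (a hypothetical sketch, not code from this patch): a single entry stored over 2^order indices with XA_STATE_ORDER()/xas_store() is returned, together with any mark set on it, at every index it covers. The xarray "demo", the function demo_multi_index() and XA_MARK_0 below are invented stand-ins for the page cache's i_pages and PAGECACHE_TAG_DIRTY.

/* Illustrative sketch only (requires CONFIG_XARRAY_MULTI); not part of this patch. */
#include <linux/xarray.h>

static DEFINE_XARRAY(demo);

static void demo_multi_index(void *folio)
{
	/* One order-4 entry covers indices 16..31, like a 16-page folio. */
	XA_STATE_ORDER(xas, &demo, 16, 4);

	do {
		xas_lock(&xas);
		xas_store(&xas, folio);
		xas_unlock(&xas);
	} while (xas_nomem(&xas, GFP_KERNEL));

	/* Mark the entry at its head index (think PAGECACHE_TAG_DIRTY). */
	xa_set_mark(&demo, 16, XA_MARK_0);

	/*
	 * A lookup starting at what used to be a tail index still sees the
	 * entry and its mark; with 2^N single-index entries, only index 16
	 * carried the mark, which is the writeback bug described above.
	 */
	WARN_ON(xa_load(&demo, 20) != folio);
	WARN_ON(!xa_get_mark(&demo, 20, XA_MARK_0));
}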
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: William Kucharski <william.kucharski@oracle.com>
parent 25a8de7f
@@ -1125,16 +1125,6 @@ static inline unsigned int __readahead_batch(struct readahead_control *rac,
 		VM_BUG_ON_PAGE(PageTail(page), page);
 		array[i++] = page;
 		rac->_batch_count += thp_nr_pages(page);
-
-		/*
-		 * The page cache isn't using multi-index entries yet,
-		 * so the xas cursor needs to be manually moved to the
-		 * next index. This can be removed once the page cache
-		 * is converted.
-		 */
-		if (PageHead(page))
-			xas_set(&xas, rac->_index + rac->_batch_count);
-
 		if (i == array_sz)
 			break;
 	}
@@ -135,7 +135,6 @@ static void page_cache_delete(struct address_space *mapping,
 	}
 
 	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
-	VM_BUG_ON_FOLIO(nr != 1 && shadow, folio);
 
 	xas_store(&xas, shadow);
 	xas_init_marks(&xas);
@@ -286,7 +285,7 @@ static void page_cache_delete_batch(struct address_space *mapping,
 			     struct folio_batch *fbatch)
 {
 	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
-	int total_pages = 0;
+	long total_pages = 0;
 	int i = 0;
 	struct folio *folio;
 
@@ -313,18 +312,12 @@ static void page_cache_delete_batch(struct address_space *mapping,
 		WARN_ON_ONCE(!folio_test_locked(folio));
-		if (folio->index == xas.xa_index)
-			folio->mapping = NULL;
+		folio->mapping = NULL;
 		/* Leave folio->index set: truncation lookup relies on it */
 
-		/*
-		 * Move to the next folio in the batch if this is a regular
-		 * folio or the index is of the last sub-page of this folio.
-		 */
-		if (folio->index + folio_nr_pages(folio) - 1 == xas.xa_index)
-			i++;
+		i++;
 		xas_store(&xas, NULL);
-		total_pages++;
+		total_pages += folio_nr_pages(folio);
 	}
 	mapping->nrpages -= total_pages;
 }
@@ -2089,24 +2082,27 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
 		indices[fbatch->nr] = xas.xa_index;
 		if (!folio_batch_add(fbatch, folio))
 			break;
-		goto next;
+		continue;
 unlock:
 		folio_unlock(folio);
 put:
 		folio_put(folio);
-next:
-		if (!xa_is_value(folio) && folio_test_large(folio)) {
-			xas_set(&xas, folio->index + folio_nr_pages(folio));
-			/* Did we wrap on 32-bit? */
-			if (!xas.xa_index)
-				break;
-		}
 	}
 	rcu_read_unlock();
 
 	return folio_batch_count(fbatch);
 }
 
+static inline
+bool folio_more_pages(struct folio *folio, pgoff_t index, pgoff_t max)
+{
+	if (!folio_test_large(folio) || folio_test_hugetlb(folio))
+		return false;
+	if (index >= max)
+		return false;
+	return index < folio->index + folio_nr_pages(folio) - 1;
+}
+
 /**
  * find_get_pages_range - gang pagecache lookup
  * @mapping:	The address_space to search
@@ -2145,11 +2141,17 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
 		if (xa_is_value(folio))
 			continue;
+again:
 		pages[ret] = folio_file_page(folio, xas.xa_index);
 		if (++ret == nr_pages) {
 			*start = xas.xa_index + 1;
 			goto out;
 		}
+		if (folio_more_pages(folio, xas.xa_index, end)) {
+			xas.xa_index++;
+			folio_ref_inc(folio);
+			goto again;
+		}
 	}
 
 	/*
@@ -2207,9 +2209,15 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
 		if (unlikely(folio != xas_reload(&xas)))
 			goto put_page;
 
-		pages[ret] = &folio->page;
+again:
+		pages[ret] = folio_file_page(folio, xas.xa_index);
 		if (++ret == nr_pages)
 			break;
+		if (folio_more_pages(folio, xas.xa_index, ULONG_MAX)) {
+			xas.xa_index++;
+			folio_ref_inc(folio);
+			goto again;
+		}
 		continue;
 put_page:
 		folio_put(folio);
@@ -2334,8 +2342,7 @@ static void filemap_get_read_batch(struct address_space *mapping,
 			break;
 		if (folio_test_readahead(folio))
 			break;
-		xas.xa_index = folio->index + folio_nr_pages(folio) - 1;
-		xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;
+		xas_advance(&xas, folio->index + folio_nr_pages(folio) - 1);
 		continue;
 put_folio:
 		folio_put(folio);
@@ -3284,6 +3291,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
 	do {
+again:
 		page = folio_file_page(folio, xas.xa_index);
 		if (PageHWPoison(page))
 			goto unlock;
@@ -3305,9 +3313,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		do_set_pte(vmf, page, addr);
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, addr, vmf->pte);
+		if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
+			xas.xa_index++;
+			folio_ref_inc(folio);
+			goto again;
+		}
 		folio_unlock(folio);
 		continue;
 unlock:
+		if (folio_more_pages(folio, xas.xa_index, end_pgoff)) {
+			xas.xa_index++;
+			goto again;
+		}
 		folio_unlock(folio);
 		folio_put(folio);
 	} while ((folio = next_map_page(mapping, &xas, end_pgoff)) != NULL);
@@ -2614,6 +2614,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 {
 	struct page *head = compound_head(page);
 	struct deferred_split *ds_queue = get_deferred_split_queue(head);
+	XA_STATE(xas, &head->mapping->i_pages, head->index);
 	struct anon_vma *anon_vma = NULL;
 	struct address_space *mapping = NULL;
 	int extra_pins, ret;
@@ -2652,6 +2653,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			goto out;
 		}
 
+		xas_split_alloc(&xas, head, compound_order(head),
+				mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
+		if (xas_error(&xas)) {
+			ret = xas_error(&xas);
+			goto out;
+		}
+
 		anon_vma = NULL;
 		i_mmap_lock_read(mapping);
@@ -2681,13 +2689,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	/* block interrupt reentry in xa_lock and spinlock */
 	local_irq_disable();
 	if (mapping) {
-		XA_STATE(xas, &mapping->i_pages, page_index(head));
-
 		/*
 		 * Check if the head page is present in page cache.
 		 * We assume all tail are present too, if head is there.
 		 */
-		xa_lock(&mapping->i_pages);
+		xas_lock(&xas);
+		xas_reset(&xas);
 		if (xas_load(&xas) != head)
 			goto fail;
 	}
@@ -2703,6 +2710,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		if (mapping) {
 			int nr = thp_nr_pages(head);
 
+			xas_split(&xas, head, thp_order(head));
 			if (PageSwapBacked(head)) {
 				__mod_lruvec_page_state(head, NR_SHMEM_THPS,
 							-nr);
@@ -2719,7 +2727,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		spin_unlock(&ds_queue->split_queue_lock);
 fail:
 		if (mapping)
-			xa_unlock(&mapping->i_pages);
+			xas_unlock(&xas);
 		local_irq_enable();
 		remap_page(head, thp_nr_pages(head));
 		ret = -EBUSY;
@@ -2733,6 +2741,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 	if (mapping)
 		i_mmap_unlock_read(mapping);
 out:
+	/* Free any memory we didn't use */
+	xas_nomem(&xas, 0);
 	count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
 	return ret;
 }
@@ -1667,7 +1667,10 @@ static void collapse_file(struct mm_struct *mm,
 	}
 	count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
 
-	/* This will be less messy when we use multi-index entries */
+	/*
+	 * Ensure we have slots for all the pages in the range. This is
+	 * almost certainly a no-op because most of the pages must be present
+	 */
 	do {
 		xas_lock_irq(&xas);
 		xas_create_range(&xas);
@@ -1892,6 +1895,9 @@ static void collapse_file(struct mm_struct *mm,
 			__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
 	}
 
+	/* Join all the small entries into a single multi-index entry */
+	xas_set_order(&xas, start, HPAGE_PMD_ORDER);
+	xas_store(&xas, new_page);
 xa_locked:
 	xas_unlock_irq(&xas);
 xa_unlocked:
@@ -2013,6 +2019,10 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 			continue;
 		}
 
+		/*
+		 * XXX: khugepaged should compact smaller compound pages
+		 * into a PMD sized page
+		 */
 		if (PageTransCompound(page)) {
 			result = SCAN_PAGE_COMPOUND;
 			break;
@@ -433,14 +433,6 @@ int folio_migrate_mapping(struct address_space *mapping,
 	}
 
 	xas_store(&xas, newfolio);
-	if (nr > 1) {
-		int i;
-
-		for (i = 1; i < nr; i++) {
-			xas_next(&xas);
-			xas_store(&xas, newfolio);
-		}
-	}
 
 	/*
 	 * Drop cache reference from old page by unfreezing
@@ -694,7 +694,6 @@ static int shmem_add_to_page_cache(struct page *page,
 				   struct mm_struct *charge_mm)
 {
 	XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
-	unsigned long i = 0;
 	unsigned long nr = compound_nr(page);
 	int error;
 
@@ -721,20 +720,18 @@ static int shmem_add_to_page_cache(struct page *page,
 	cgroup_throttle_swaprate(page, gfp);
 
 	do {
-		void *entry;
 		xas_lock_irq(&xas);
-		entry = xas_find_conflict(&xas);
-		if (entry != expected)
+		if (expected != xas_find_conflict(&xas)) {
+			xas_set_err(&xas, -EEXIST);
+			goto unlock;
+		}
+		if (expected && xas_find_conflict(&xas)) {
 			xas_set_err(&xas, -EEXIST);
-		xas_create_range(&xas);
-		if (xas_error(&xas))
 			goto unlock;
-next:
-		xas_store(&xas, page);
-		if (++i < nr) {
-			xas_next(&xas);
-			goto next;
 		}
+		xas_store(&xas, page);
+		if (xas_error(&xas))
+			goto unlock;
 		if (PageTransHuge(page)) {
 			count_vm_event(THP_FILE_ALLOC);
 			__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);