// SPDX-License-Identifier: GPL-2.0-only
/*
 *	linux/mm/filemap.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * This file handles the generic file mmap semantics used by
 * most "normal" filesystems (but you don't /have/ to use this:
 * the NFS filesystem used to do this differently, for example)
 */
#include <linux/export.h>
#include <linux/compiler.h>
#include <linux/dax.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/kernel_stat.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/syscalls.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/pagevec.h>
#include <linux/security.h>
#include <linux/cpuset.h>
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
#include <linux/page_idle.h>
#include <linux/migrate.h>
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

/*
 * FIXME: remove all knowledge of the buffer layer from the core VM
 */
#include <linux/buffer_head.h> /* for try_to_free_buffers */

#include <asm/mman.h>

#include "swap.h"

/*
 * Shared mappings implemented 30.11.1994. It's not fully working yet,
 * though.
 *
 * Shared mappings now work. 15.8.1995  Bruno.
 *
 * finished 'unifying' the page and buffer cache and SMP-threaded the
 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
 *
 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
 */

/*
 * Lock ordering:
 *
 *  ->i_mmap_rwsem		(truncate_pagecache)
 *    ->private_lock		(__free_pte->block_dirty_folio)
 *      ->swap_lock		(exclusive_swap_page, others)
 *        ->i_pages lock
 *
 *  ->i_rwsem
 *    ->invalidate_lock		(acquired by fs in truncate path)
 *      ->i_mmap_rwsem		(truncate->unmap_mapping_range)
 *
 *  ->mmap_lock
 *    ->i_mmap_rwsem
 *      ->page_table_lock or pte_lock	(various, mainly in memory.c)
 *        ->i_pages lock	(arch-dependent flush_dcache_mmap_lock)
 *
 *  ->mmap_lock
 *    ->invalidate_lock		(filemap_fault)
 *      ->lock_page		(filemap_fault, access_process_vm)
 *
 *  ->i_rwsem			(generic_perform_write)
 *    ->mmap_lock		(fault_in_readable->do_page_fault)
 *
 *  bdi->wb.list_lock
 *    sb_lock			(fs/fs-writeback.c)
 *    ->i_pages lock		(__sync_single_inode)
 *
 *  ->i_mmap_rwsem
 *    ->anon_vma.lock		(vma_merge)
 *
 *  ->anon_vma.lock
 *    ->page_table_lock or pte_lock	(anon_vma_prepare and various)
 *
 *  ->page_table_lock or pte_lock
 *    ->swap_lock		(try_to_unmap_one)
 *    ->private_lock		(try_to_unmap_one)
 *    ->i_pages lock		(try_to_unmap_one)
 *    ->lruvec->lru_lock	(follow_page->mark_page_accessed)
 *    ->lruvec->lru_lock	(check_pte_range->isolate_lru_page)
 *    ->private_lock		(page_remove_rmap->set_page_dirty)
 *    ->i_pages lock		(page_remove_rmap->set_page_dirty)
 *    bdi.wb->list_lock		(page_remove_rmap->set_page_dirty)
 *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
 *    ->memcg->move_lock	(page_remove_rmap->folio_memcg_lock)
 *    bdi.wb->list_lock		(zap_pte_range->set_page_dirty)
 *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
 *    ->private_lock		(zap_pte_range->block_dirty_folio)
 */
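
/*
 * Illustrative sketch, not code from this file: the truncate path is one
 * place where the top of the ordering above is taken in sequence by a
 * filesystem:
 *
 *	inode_lock(inode);			(->i_rwsem)
 *	filemap_invalidate_lock(mapping);	(->invalidate_lock)
 *	truncate_pagecache(inode, newsize);	(takes ->i_mmap_rwsem via
 *						 unmap_mapping_range())
 *	filemap_invalidate_unlock(mapping);
 *	inode_unlock(inode);
 */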

static void page_cache_delete(struct address_space *mapping,
				   struct folio *folio, void *shadow)
{
	XA_STATE(xas, &mapping->i_pages, folio->index);
	long nr = 1;

	mapping_set_update(&xas, mapping);

	xas_set_order(&xas, folio->index, folio_order(folio));
	nr = folio_nr_pages(folio);

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);

	xas_store(&xas, shadow);
	xas_init_marks(&xas);

	folio->mapping = NULL;
	/* Leave page->index set: truncation lookup relies upon it */
	mapping->nrpages -= nr;
}

static void filemap_unaccount_folio(struct address_space *mapping,
		struct folio *folio)
{
	long nr;

	VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
	if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
		pr_alert("BUG: Bad page cache in process %s  pfn:%05lx\n",
			 current->comm, folio_pfn(folio));
		dump_page(&folio->page, "still mapped when deleted");
		dump_stack();
		add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);

		if (mapping_exiting(mapping) && !folio_test_large(folio)) {
			int mapcount = page_mapcount(&folio->page);

			if (folio_ref_count(folio) >= mapcount + 2) {
				/*
				 * All vmas have already been torn down, so it's
				 * a good bet that actually the page is unmapped
				 * and we'd rather not leak it: if we're wrong,
				 * another bad page check should catch it later.
				 */
				page_mapcount_reset(&folio->page);
				folio_ref_sub(folio, mapcount);
			}
		}
	}

	/* hugetlb folios do not participate in page cache accounting. */
	if (folio_test_hugetlb(folio))
		return;

	nr = folio_nr_pages(folio);

	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
	if (folio_test_swapbacked(folio)) {
		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
		if (folio_test_pmd_mappable(folio))
			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
	} else if (folio_test_pmd_mappable(folio)) {
		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
		filemap_nr_thps_dec(mapping);
	}

	/*
	 * At this point folio must be either written or cleaned by
	 * truncate.  Dirty folio here signals a bug and loss of
	 * unwritten data - on ordinary filesystems.
	 *
	 * But it's harmless on in-memory filesystems like tmpfs; and can
	 * occur when a driver which did get_user_pages() sets page dirty
	 * before putting it, while the inode is being finally evicted.
	 *
	 * Below fixes dirty accounting after removing the folio entirely
	 * but leaves the dirty flag set: it has no effect for truncated
	 * folio and anyway will be cleared before returning folio to
	 * buddy allocator.
	 */
	if (WARN_ON_ONCE(folio_test_dirty(folio) &&
			 mapping_can_writeback(mapping)))
		folio_account_cleaned(folio, inode_to_wb(mapping->host));
}

/*
 * Delete a page from the page cache and free it. Caller has to make
 * sure the page is locked and that nobody else uses it - or that usage
 * is safe.  The caller must hold the i_pages lock.
 */
void __filemap_remove_folio(struct folio *folio, void *shadow)
{
	struct address_space *mapping = folio->mapping;

	trace_mm_filemap_delete_from_page_cache(folio);
	filemap_unaccount_folio(mapping, folio);
	page_cache_delete(mapping, folio, shadow);
}

void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
	void (*free_folio)(struct folio *);
	int refs = 1;

	free_folio = mapping->a_ops->free_folio;
	if (free_folio)
		free_folio(folio);

	if (folio_test_large(folio))
		refs = folio_nr_pages(folio);
	folio_put_refs(folio, refs);
}

/**
 * filemap_remove_folio - Remove folio from page cache.
 * @folio: The folio.
 *
 * This must be called only on folios that are locked and have been
 * verified to be in the page cache.  It will never put the folio into
 * the free list because the caller has a reference on the page.
 */
void filemap_remove_folio(struct folio *folio)
{
	struct address_space *mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	__filemap_remove_folio(folio, NULL);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	filemap_free_folio(mapping, folio);
}

/*
 * page_cache_delete_batch - delete several folios from page cache
 * @mapping: the mapping to which folios belong
 * @fbatch: batch of folios to delete
 *
 * The function walks over mapping->i_pages and removes folios passed in
 * @fbatch from the mapping. The function expects @fbatch to be sorted
 * by page index and is optimised for it to be dense.
 * It tolerates holes in @fbatch (mapping entries at those indices are not
 * modified).
273
 *
Matthew Wilcox's avatar
Matthew Wilcox committed
274
 * The function expects the i_pages lock to be held.
 */
static void page_cache_delete_batch(struct address_space *mapping,
			     struct folio_batch *fbatch)
{
	XA_STATE(xas, &mapping->i_pages, fbatch->folios[0]->index);
	long total_pages = 0;
	int i = 0;
	struct folio *folio;

	mapping_set_update(&xas, mapping);
	xas_for_each(&xas, folio, ULONG_MAX) {
		if (i >= folio_batch_count(fbatch))
			break;

		/* A swap/dax/shadow entry got inserted? Skip it. */
		if (xa_is_value(folio))
			continue;
		/*
		 * A page got inserted in our range? Skip it. We have our
		 * pages locked so they are protected from being removed.
		 * If we see a page whose index is higher than ours, it
		 * means our page has been removed, which shouldn't be
		 * possible because we're holding the PageLock.
		 */
		if (folio != fbatch->folios[i]) {
			VM_BUG_ON_FOLIO(folio->index >
					fbatch->folios[i]->index, folio);
			continue;
		}

		WARN_ON_ONCE(!folio_test_locked(folio));

		folio->mapping = NULL;
		/* Leave folio->index set: truncation lookup relies on it */

		i++;
		xas_store(&xas, NULL);
		total_pages += folio_nr_pages(folio);
	}
	mapping->nrpages -= total_pages;
}

void delete_from_page_cache_batch(struct address_space *mapping,
				  struct folio_batch *fbatch)
{
	int i;

	if (!folio_batch_count(fbatch))
		return;

	spin_lock(&mapping->host->i_lock);
	xa_lock_irq(&mapping->i_pages);
	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		trace_mm_filemap_delete_from_page_cache(folio);
		filemap_unaccount_folio(mapping, folio);
	}
	page_cache_delete_batch(mapping, fbatch);
	xa_unlock_irq(&mapping->i_pages);
	if (mapping_shrinkable(mapping))
		inode_add_lru(mapping->host);
	spin_unlock(&mapping->host->i_lock);

	for (i = 0; i < folio_batch_count(fbatch); i++)
		filemap_free_folio(mapping, fbatch->folios[i]);
}

int filemap_check_errors(struct address_space *mapping)
{
	int ret = 0;
	/* Check for outstanding write errors */
	if (test_bit(AS_ENOSPC, &mapping->flags) &&
	    test_and_clear_bit(AS_ENOSPC, &mapping->flags))
		ret = -ENOSPC;
	if (test_bit(AS_EIO, &mapping->flags) &&
	    test_and_clear_bit(AS_EIO, &mapping->flags))
		ret = -EIO;
	return ret;
}
EXPORT_SYMBOL(filemap_check_errors);

static int filemap_check_and_keep_errors(struct address_space *mapping)
{
	/* Check for outstanding write errors */
	if (test_bit(AS_EIO, &mapping->flags))
		return -EIO;
	if (test_bit(AS_ENOSPC, &mapping->flags))
		return -ENOSPC;
	return 0;
}

/**
 * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @wbc:	the writeback_control controlling the writeout
 *
 * Call writepages on the mapping using the provided wbc to control the
 * writeout.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_fdatawrite_wbc(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	int ret;

	if (!mapping_can_writeback(mapping) ||
	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	wbc_attach_fdatawrite_inode(wbc, mapping->host);
	ret = do_writepages(mapping, wbc);
	wbc_detach_inode(wbc);
	return ret;
}
EXPORT_SYMBOL(filemap_fdatawrite_wbc);
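
/*
 * Illustrative example, not code from this file: a caller that needs more
 * control than __filemap_fdatawrite_range() can build the writeback_control
 * itself, e.g. for a bounded, non-blocking flush:
 *
 *	struct writeback_control wbc = {
 *		.sync_mode	= WB_SYNC_NONE,
 *		.nr_to_write	= 128,
 *		.range_start	= 0,
 *		.range_end	= LLONG_MAX,
 *	};
 *
 *	err = filemap_fdatawrite_wbc(mapping, &wbc);
 */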

/**
 * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
 * @mapping:	address space structure to write
 * @start:	offset in bytes where the range starts
 * @end:	offset in bytes where the range ends (inclusive)
 * @sync_mode:	enable synchronous operation
 *
 * Start writeback against all of a mapping's dirty pages that lie
 * within the byte offsets <start, end> inclusive.
 *
 * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
 * opposed to a regular memory cleansing writeback.  The difference between
 * these two operations is that if a dirty page/buffer is encountered, it must
 * be waited upon, and not just skipped over.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	return filemap_fdatawrite_wbc(mapping, &wbc);
}

static inline int __filemap_fdatawrite(struct address_space *mapping,
	int sync_mode)
{
	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
}

int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite);

int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end)
{
	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
}
EXPORT_SYMBOL(filemap_fdatawrite_range);
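
/*
 * Illustrative example, not code from this file: the end offset is
 * inclusive, so writing out @count bytes starting at @pos looks like:
 *
 *	err = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
 */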

/**
 * filemap_flush - mostly a non-blocking flush
 * @mapping:	target address_space
 *
 * This is a mostly non-blocking flush.  Not suitable for data-integrity
 * purposes - I/O may not be started against all dirty pages.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int filemap_flush(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
}
EXPORT_SYMBOL(filemap_flush);

/**
 * filemap_range_has_page - check if a page exists in range.
 * @mapping:           address space within which to check
 * @start_byte:        offset in bytes where the range starts
 * @end_byte:          offset in bytes where the range ends (inclusive)
 *
 * Find at least one page in the range supplied, usually used to check if
 * direct writing in this range will trigger a writeback.
 *
 * Return: %true if at least one page exists in the specified range,
 * %false otherwise.
 */
bool filemap_range_has_page(struct address_space *mapping,
			   loff_t start_byte, loff_t end_byte)
{
	struct folio *folio;
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	for (;;) {
		folio = xas_find(&xas, max);
		if (xas_retry(&xas, folio))
			continue;
		/* Shadow entries don't count */
		if (xa_is_value(folio))
			continue;
		/*
		 * We don't need to try to pin this page; we're about to
		 * release the RCU lock anyway.  It is enough to know that
		 * there was a page here recently.
		 */
		break;
	}
	rcu_read_unlock();

	return folio != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);
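
/*
 * Illustrative example, not code from this file: a direct-I/O write path
 * might use this as a cheap check before deciding whether a pagecache
 * invalidation pass is needed at all:
 *
 *	if (filemap_range_has_page(mapping, pos, pos + count - 1))
 *		... write back and/or invalidate the range ...
 */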

static void __filemap_fdatawait_range(struct address_space *mapping,
				     loff_t start_byte, loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_SHIFT;
	pgoff_t end = end_byte >> PAGE_SHIFT;
	struct folio_batch fbatch;
	unsigned nr_folios;

	folio_batch_init(&fbatch);

	while (index <= end) {
		unsigned i;

		nr_folios = filemap_get_folios_tag(mapping, &index, end,
				PAGECACHE_TAG_WRITEBACK, &fbatch);

		if (!nr_folios)
			break;

		for (i = 0; i < nr_folios; i++) {
			struct folio *folio = fbatch.folios[i];

			folio_wait_writeback(folio);
			folio_clear_error(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
}

/**
 * filemap_fdatawait_range - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space
 * in the given range and wait for all of them.  Check error status of
 * the address space and return it.
 *
 * Since the error status of the address space is cleared by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range);

/**
 * filemap_fdatawait_range_keep_errors - wait for writeback to complete
 * @mapping:		address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the given address space in the
 * given range and wait for all of them.  Unlike filemap_fdatawait_range(),
 * this function does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 */
int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
		loff_t start_byte, loff_t end_byte)
{
	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);

/**
 * file_fdatawait_range - wait for writeback to complete
 * @file:		file pointing to address space structure to wait for
 * @start_byte:		offset in bytes where the range starts
 * @end_byte:		offset in bytes where the range ends (inclusive)
 *
 * Walk the list of under-writeback pages of the address space that file
 * refers to, in the given range and wait for all of them.  Check error
 * status of the address space vs. the file->f_wb_err cursor and return it.
 *
 * Since the error status of the file is advanced by this function,
 * callers are responsible for checking the return value and handling and/or
 * reporting the error.
 *
 * Return: error status of the address space vs. the file->f_wb_err cursor.
 */
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
	struct address_space *mapping = file->f_mapping;

	__filemap_fdatawait_range(mapping, start_byte, end_byte);
	return file_check_and_advance_wb_err(file);
}
EXPORT_SYMBOL(file_fdatawait_range);

/**
 * filemap_fdatawait_keep_errors - wait for writeback without clearing errors
 * @mapping: address space structure to wait for
 *
 * Walk the list of under-writeback pages of the given address space
 * and wait for all of them.  Unlike filemap_fdatawait(), this function
 * does not clear error status of the address space.
 *
 * Use this function if callers don't handle errors themselves.  Expected
 * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
 * fsfreeze(8)
 *
 * Return: error status of the address space.
 */
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
	__filemap_fdatawait_range(mapping, 0, LLONG_MAX);
	return filemap_check_and_keep_errors(mapping);
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);

/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
	return mapping->nrpages;
}

bool filemap_range_has_writeback(struct address_space *mapping,
				 loff_t start_byte, loff_t end_byte)
{
	XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
	pgoff_t max = end_byte >> PAGE_SHIFT;
	struct folio *folio;

	if (end_byte < start_byte)
		return false;

	rcu_read_lock();
	xas_for_each(&xas, folio, max) {
		if (xas_retry(&xas, folio))
			continue;
		if (xa_is_value(folio))
			continue;
		if (folio_test_dirty(folio) || folio_test_locked(folio) ||
				folio_test_writeback(folio))
			break;
	}
	rcu_read_unlock();
	return folio != NULL;
}
EXPORT_SYMBOL_GPL(filemap_range_has_writeback);

/**
 * filemap_write_and_wait_range - write out & wait on a file range
 * @mapping:	the address_space for the pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * Return: error status of the address space.
 */
int filemap_write_and_wait_range(struct address_space *mapping,
				 loff_t lstart, loff_t lend)
{
	int err = 0, err2;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/*
		 * Even if the above returned error, the pages may be
		 * written partially (e.g. -ENOSPC), so we wait for it.
		 * But the -EIO is special case, it may indicate the worst
		 * thing (e.g. bug) happened, so we avoid waiting for it.
		 */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = filemap_check_errors(mapping);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(filemap_write_and_wait_range);
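
/*
 * Illustrative example, not code from this file: flushing an entire file is
 * the "end = -1" case mentioned above; it is what the filemap_write_and_wait()
 * helper expands to:
 *
 *	err = filemap_write_and_wait_range(mapping, 0, LLONG_MAX);
 */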

void __filemap_set_wb_err(struct address_space *mapping, int err)
{
	errseq_t eseq = errseq_set(&mapping->wb_err, err);

	trace_filemap_set_wb_err(mapping, eseq);
}
EXPORT_SYMBOL(__filemap_set_wb_err);

/**
 * file_check_and_advance_wb_err - report wb error (if any) that was previously
 * 				   and advance wb_err to current one
 * @file: struct file on which the error is being reported
 *
 * When userland calls fsync (or something like nfsd does the equivalent), we
 * want to report any writeback errors that occurred since the last fsync (or
 * since the file was opened if there haven't been any).
 *
 * Grab the wb_err from the mapping. If it matches what we have in the file,
 * then just quickly return 0. The file is all caught up.
 *
 * If it doesn't match, then take the mapping value, set the "seen" flag in
 * it and try to swap it into place. If it works, or another task beat us
 * to it with the new value, then update the f_wb_err and return the error
 * portion. The error at this point must be reported via proper channels
 * (a'la fsync, or NFS COMMIT operation, etc.).
 *
 * While we handle mapping->wb_err with atomic operations, the f_wb_err
 * value is protected by the f_lock since we must ensure that it reflects
 * the latest value swapped in for this file descriptor.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_check_and_advance_wb_err(struct file *file)
{
	int err = 0;
	errseq_t old = READ_ONCE(file->f_wb_err);
	struct address_space *mapping = file->f_mapping;

	/* Locklessly handle the common case where nothing has changed */
	if (errseq_check(&mapping->wb_err, old)) {
		/* Something changed, must use slow path */
		spin_lock(&file->f_lock);
		old = file->f_wb_err;
		err = errseq_check_and_advance(&mapping->wb_err,
						&file->f_wb_err);
		trace_file_check_and_advance_wb_err(file, old);
		spin_unlock(&file->f_lock);
	}

	/*
	 * We're mostly using this function as a drop in replacement for
	 * filemap_check_errors. Clear AS_EIO/AS_ENOSPC to emulate the effect
	 * that the legacy code would have had on these flags.
	 */
	clear_bit(AS_EIO, &mapping->flags);
	clear_bit(AS_ENOSPC, &mapping->flags);
	return err;
}
EXPORT_SYMBOL(file_check_and_advance_wb_err);

/**
 * file_write_and_wait_range - write out & wait on a file range
 * @file:	file pointing to address_space with pages
 * @lstart:	offset in bytes where the range starts
 * @lend:	offset in bytes where the range ends (inclusive)
 *
 * Write out and wait upon file offsets lstart->lend, inclusive.
 *
 * Note that @lend is inclusive (describes the last byte to be written) so
 * that this function can be used to write to the very end-of-file (end = -1).
 *
 * After writing out and waiting on the data, we check and advance the
 * f_wb_err cursor to the latest value, and return any errors detected there.
 *
 * Return: %0 on success, negative error code otherwise.
 */
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
	int err = 0, err2;
	struct address_space *mapping = file->f_mapping;

	if (lend < lstart)
		return 0;

	if (mapping_needs_writeback(mapping)) {
		err = __filemap_fdatawrite_range(mapping, lstart, lend,
						 WB_SYNC_ALL);
		/* See comment of filemap_write_and_wait() */
		if (err != -EIO)
			__filemap_fdatawait_range(mapping, lstart, lend);
	}
	err2 = file_check_and_advance_wb_err(file);
	if (!err)
		err = err2;
	return err;
}
EXPORT_SYMBOL(file_write_and_wait_range);
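
/*
 * Illustrative sketch, not code from this file: a minimal ->fsync()
 * implementation in a hypothetical filesystem could be little more than:
 *
 *	static int example_fsync(struct file *file, loff_t start, loff_t end,
 *				 int datasync)
 *	{
 *		return file_write_and_wait_range(file, start, end);
 *	}
 *
 * Real filesystems typically also write back metadata and flush the device
 * cache before returning.
 */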

/**
 * replace_page_cache_folio - replace a pagecache folio with a new one
 * @old:	folio to be replaced
 * @new:	folio to replace with
 *
 * This function replaces a folio in the pagecache with a new one.  On
 * success it acquires the pagecache reference for the new folio and
 * drops it for the old folio.  Both the old and new folios must be
 * locked.  This function does not add the new folio to the LRU, the
 * caller must do that.
 *
 * The remove + add is atomic.  This function cannot fail.
 */
void replace_page_cache_folio(struct folio *old, struct folio *new)
{
	struct address_space *mapping = old->mapping;
	void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
	pgoff_t offset = old->index;
	XA_STATE(xas, &mapping->i_pages, offset);

	VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
	VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
	VM_BUG_ON_FOLIO(new->mapping, new);

	folio_get(new);
	new->mapping = mapping;
	new->index = offset;

	mem_cgroup_replace_folio(old, new);

	xas_lock_irq(&xas);
	xas_store(&xas, new);

	old->mapping = NULL;
	/* hugetlb pages do not participate in page cache accounting. */
	if (!folio_test_hugetlb(old))
		__lruvec_stat_sub_folio(old, NR_FILE_PAGES);
	if (!folio_test_hugetlb(new))
		__lruvec_stat_add_folio(new, NR_FILE_PAGES);
	if (folio_test_swapbacked(old))
		__lruvec_stat_sub_folio(old, NR_SHMEM);
	if (folio_test_swapbacked(new))
		__lruvec_stat_add_folio(new, NR_SHMEM);
	xas_unlock_irq(&xas);
	if (free_folio)
		free_folio(old);
	folio_put(old);
}
EXPORT_SYMBOL_GPL(replace_page_cache_folio);

noinline int __filemap_add_folio(struct address_space *mapping,
		struct folio *folio, pgoff_t index, gfp_t gfp, void **shadowp)
{
	XA_STATE(xas, &mapping->i_pages, index);
	int huge = folio_test_hugetlb(folio);
	bool charged = false;
	long nr = 1;

	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
	mapping_set_update(&xas, mapping);

	if (!huge) {
		int error = mem_cgroup_charge(folio, NULL, gfp);
		if (error)
			return error;
		charged = true;
	}

	VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
	xas_set_order(&xas, index, folio_order(folio));
	nr = folio_nr_pages(folio);

	gfp &= GFP_RECLAIM_MASK;
	folio_ref_add(folio, nr);
	folio->mapping = mapping;
	folio->index = xas.xa_index;

	do {
		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
		void *entry, *old = NULL;

		if (order > folio_order(folio))
			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
					order, gfp);
		xas_lock_irq(&xas);
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
		}

		if (old) {
			if (shadowp)
				*shadowp = old;
			/* entry may have been split before we acquired lock */
			order = xa_get_order(xas.xa, xas.xa_index);
			if (order > folio_order(folio)) {
				/* How to handle large swap entries? */
				BUG_ON(shmem_mapping(mapping));
				xas_split(&xas, old, order);
				xas_reset(&xas);
			}
		}

		xas_store(&xas, folio);
		if (xas_error(&xas))
			goto unlock;

		mapping->nrpages += nr;

		/* hugetlb pages do not participate in page cache accounting */
		if (!huge) {
			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
			if (folio_test_pmd_mappable(folio))
				__lruvec_stat_mod_folio(folio,
						NR_FILE_THPS, nr);
		}
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas))
		goto error;

	trace_mm_filemap_add_to_page_cache(folio);
	return 0;
error:
	if (charged)
		mem_cgroup_uncharge(folio);
	folio->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	folio_put_refs(folio, nr);
	return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);

int filemap_add_folio(struct address_space *mapping, struct folio *folio,
				pgoff_t index, gfp_t gfp)
{
	void *shadow = NULL;
	int ret;

	__folio_set_locked(folio);
	ret = __filemap_add_folio(mapping, folio, index, gfp, &shadow);
	if (unlikely(ret))
		__folio_clear_locked(folio);
	else {
		/*
		 * The folio might have been evicted from cache only
		 * recently, in which case it should be activated like
		 * any other repeatedly accessed folio.
		 * The exception is folios getting rewritten; evicting other
		 * data from the working set, only to cache data that will
		 * get overwritten with something else, is a waste of memory.
		 */
		WARN_ON_ONCE(folio_test_active(folio));
		if (!(gfp & __GFP_WRITE) && shadow)
			workingset_refault(folio, shadow);
		folio_add_lru(folio);
	}
	return ret;
}
EXPORT_SYMBOL_GPL(filemap_add_folio);
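
/*
 * Illustrative sketch, not code from this file: a typical "allocate and
 * insert" sequence in a read path looks roughly like __filemap_get_folio()
 * with FGP_CREAT:
 *
 *	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
 *	if (!folio)
 *		return -ENOMEM;
 *	err = filemap_add_folio(mapping, folio, index, GFP_KERNEL);
 *	if (err) {
 *		folio_put(folio);
 *		return err;
 *	}
 *
 * On success the folio is locked, referenced and present in the page cache.
 */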

#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order)
{
	int n;
	struct folio *folio;

	if (cpuset_do_page_mem_spread()) {
		unsigned int cpuset_mems_cookie;
		do {
			cpuset_mems_cookie = read_mems_allowed_begin();
			n = cpuset_mem_spread_node();
			folio = __folio_alloc_node(gfp, order, n);
		} while (!folio && read_mems_allowed_retry(cpuset_mems_cookie));

		return folio;
	}
	return folio_alloc(gfp, order);
}
EXPORT_SYMBOL(filemap_alloc_folio);
#endif

/*
 * filemap_invalidate_lock_two - lock invalidate_lock for two mappings
 *
 * Lock exclusively invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to lock
 * @mapping2: the second mapping to lock
 */
void filemap_invalidate_lock_two(struct address_space *mapping1,
				 struct address_space *mapping2)
{
	if (mapping1 > mapping2)
		swap(mapping1, mapping2);
	if (mapping1)
		down_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		down_write_nested(&mapping2->invalidate_lock, 1);
}
EXPORT_SYMBOL(filemap_invalidate_lock_two);

/*
 * filemap_invalidate_unlock_two - unlock invalidate_lock for two mappings
 *
 * Unlock exclusive invalidate_lock of any passed mapping that is not NULL.
 *
 * @mapping1: the first mapping to unlock
 * @mapping2: the second mapping to unlock
 */
void filemap_invalidate_unlock_two(struct address_space *mapping1,
				   struct address_space *mapping2)
{
	if (mapping1)
		up_write(&mapping1->invalidate_lock);
	if (mapping2 && mapping1 != mapping2)
		up_write(&mapping2->invalidate_lock);
}
EXPORT_SYMBOL(filemap_invalidate_unlock_two);

/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 */
#define PAGE_WAIT_TABLE_BITS 8
#define PAGE_WAIT_TABLE_SIZE (1 << PAGE_WAIT_TABLE_BITS)
static wait_queue_head_t folio_wait_table[PAGE_WAIT_TABLE_SIZE] __cacheline_aligned;

static wait_queue_head_t *folio_waitqueue(struct folio *folio)
{
	return &folio_wait_table[hash_ptr(folio, PAGE_WAIT_TABLE_BITS)];
}

void __init pagecache_init(void)
{
	int i;

	for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
		init_waitqueue_head(&folio_wait_table[i]);

	page_writeback_init();
}

/*
 * The page wait code treats the "wait->flags" somewhat unusually, because
 * we have multiple different kinds of waits, not just the usual "exclusive"
 * one.
 *
 * We have:
 *
 *  (a) no special bits set:
 *
 *	We're just waiting for the bit to be released, and when a waker
 *	calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
 *	and remove it from the wait queue.
 *
 *	Simple and straightforward.
 *
 *  (b) WQ_FLAG_EXCLUSIVE:
 *
 *	The waiter is waiting to get the lock, and only one waiter should
 *	be woken up to avoid any thundering herd behavior. We'll set the
 *	WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
 *
 *	This is the traditional exclusive wait.
 *
 *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
 *
 *	The waiter is waiting to get the bit, and additionally wants the
 *	lock to be transferred to it for fair lock behavior. If the lock
 *	cannot be taken, we stop walking the wait queue without waking
 *	the waiter.
 *
 *	This is the "fair lock handoff" case, and in addition to setting
 *	WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
 *	that it now has the lock.
 */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
	unsigned int flags;
	struct wait_page_key *key = arg;
	struct wait_page_queue *wait_page
		= container_of(wait, struct wait_page_queue, wait);

	if (!wake_page_match(wait_page, key))
		return 0;

	/*
	 * If it's a lock handoff wait, we get the bit for it, and
	 * stop walking (and do not wake it up) if we can't.
	 */
	flags = wait->flags;
	if (flags & WQ_FLAG_EXCLUSIVE) {
		if (test_bit(key->bit_nr, &key->folio->flags))
			return -1;
		if (flags & WQ_FLAG_CUSTOM) {
			if (test_and_set_bit(key->bit_nr, &key->folio->flags))
				return -1;
			flags |= WQ_FLAG_DONE;
		}
	}

	/*
	 * We are holding the wait-queue lock, but the waiter that
	 * is waiting for this will be checking the flags without
	 * any locking.
	 *
	 * So update the flags atomically, and wake up the waiter
	 * afterwards to avoid any races. This store-release pairs
	 * with the load-acquire in folio_wait_bit_common().
	 */
	smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
	wake_up_state(wait->private, mode);

	/*
	 * Ok, we have successfully done what we're waiting for,
	 * and we can unconditionally remove the wait entry.
	 *
	 * Note that this pairs with the "finish_wait()" in the
	 * waiter, and has to be the absolute last thing we do.
	 * After this list_del_init(&wait->entry) the wait entry
	 * might be de-allocated and the process might even have
	 * exited.
	 */
	list_del_init_careful(&wait->entry);
	return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}

static void folio_wake_bit(struct folio *folio, int bit_nr)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	struct wait_page_key key;
	unsigned long flags;

	key.folio = folio;
	key.bit_nr = bit_nr;
	key.page_match = 0;

	spin_lock_irqsave(&q->lock, flags);
	__wake_up_locked_key(q, TASK_NORMAL, &key);

	/*
	 * It's possible to miss clearing waiters here, when we woke our page
	 * waiters, but the hashed waitqueue has waiters for other pages on it.
	 * That's okay, it's a rare case. The next waker will clear it.
	 *
	 * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
	 * other), the flag may be cleared in the course of freeing the page;
	 * but that is not required for correctness.
	 */
	if (!waitqueue_active(q) || !key.page_match)
		folio_clear_waiters(folio);

	spin_unlock_irqrestore(&q->lock, flags);
}

/*
 * A choice of three behaviors for folio_wait_bit_common():
 */
enum behavior {
	EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like
			 * __folio_lock() waiting on then setting PG_locked.
			 */
	SHARED,		/* Hold ref to page and check the bit when woken, like
			 * folio_wait_writeback() waiting on PG_writeback.
			 */
	DROP,		/* Drop ref to page before wait, no check when woken,
			 * like folio_put_wait_locked() on PG_locked.
			 */
};

/*
 * Attempt to check (or get) the folio flag, and mark us done
 * if successful.
 */
static inline bool folio_trylock_flag(struct folio *folio, int bit_nr,
					struct wait_queue_entry *wait)
{
	if (wait->flags & WQ_FLAG_EXCLUSIVE) {
		if (test_and_set_bit(bit_nr, &folio->flags))
			return false;
	} else if (test_bit(bit_nr, &folio->flags))
		return false;

	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
	return true;
}

/* How many times do we accept lock stealing from under a waiter? */
int sysctl_page_lock_unfairness = 5;

static inline int folio_wait_bit_common(struct folio *folio, int bit_nr,
		int state, enum behavior behavior)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	int unfairness = sysctl_page_lock_unfairness;
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;

	if (bit_nr == PG_locked &&
	    !folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = bit_nr;

repeat:
	wait->flags = 0;
	if (behavior == EXCLUSIVE) {
		wait->flags = WQ_FLAG_EXCLUSIVE;
		if (--unfairness < 0)
			wait->flags |= WQ_FLAG_CUSTOM;
	}

	/*
	 * Do one last check whether we can get the
	 * page bit synchronously.
	 *
	 * Do the folio_set_waiters() marking before that
	 * to let any waker we _just_ missed know they
	 * need to wake us up (otherwise they'll never
	 * even go to the slow case that looks at the
	 * page queue), and add ourselves to the wait
	 * queue if we need to sleep.
	 *
	 * This part needs to be done under the queue
	 * lock to avoid races.
	 */
	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, bit_nr, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * From now on, all the logic will be based on
	 * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
	 * see whether the page bit testing has already
	 * been done by the wake function.
	 *
	 * We can drop our reference to the folio.
	 */
	if (behavior == DROP)
		folio_put(folio);

	/*
	 * Note that until the "finish_wait()", or until
	 * we see the WQ_FLAG_WOKEN flag, we need to
	 * be very careful with the 'wait->flags', because
	 * we may race with a waker that sets them.
	 */
	for (;;) {
		unsigned int flags;

		set_current_state(state);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(state, current))
				break;

			io_schedule();
			continue;
		}

		/* If we were non-exclusive, we're done */
		if (behavior != EXCLUSIVE)
			break;

		/* If the waker got the lock for us, we're done */
		if (flags & WQ_FLAG_DONE)
			break;

		/*
		 * Otherwise, if we're getting the lock, we need to
		 * try to get it ourselves.
		 *
		 * And if that fails, we'll have to retry this all.
		 */
		if (unlikely(test_and_set_bit(bit_nr, folio_flags(folio, 0))))
			goto repeat;

		wait->flags |= WQ_FLAG_DONE;
		break;
	}

	/*
	 * If a signal happened, this 'finish_wait()' may remove the last
	 * waiter from the wait-queues, but the folio waiters bit will remain
	 * set. That's ok. The next wakeup will take care of it, and trying
	 * to do it here would be difficult and prone to races.
	 */
	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}

	/*
	 * NOTE! The wait->flags weren't stable until we've done the
	 * 'finish_wait()', and we could have exited the loop above due
	 * to a signal, and had a wakeup event happen after the signal
	 * test but before the 'finish_wait()'.
	 *
	 * So only after the finish_wait() can we reliably determine
	 * if we got woken up or not, so we can now figure out the final
	 * return value based on that state without races.
	 *
	 * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
	 * waiter, but an exclusive one requires WQ_FLAG_DONE.
	 */
	if (behavior == EXCLUSIVE)
		return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;

	return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}

#ifdef CONFIG_MIGRATION
/**
 * migration_entry_wait_on_locked - Wait for a migration entry to be removed
 * @entry: migration swap entry.
 * @ptl: already locked ptl. This function will drop the lock.
 *
 * Wait for a migration entry referencing the given page to be removed. This is
 * equivalent to put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE) except
 * this can be called without taking a reference on the page. Instead this
 * should be called while holding the ptl for the migration entry referencing
 * the page.
 *
 * Returns after unlocking the ptl.
 *
 * This follows the same logic as folio_wait_bit_common() so see the comments
 * there.
 */
void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
	__releases(ptl)
{
	struct wait_page_queue wait_page;
	wait_queue_entry_t *wait = &wait_page.wait;
	bool thrashing = false;
	unsigned long pflags;
	bool in_thrashing;
	wait_queue_head_t *q;
	struct folio *folio = page_folio(pfn_swap_entry_to_page(entry));

	q = folio_waitqueue(folio);
	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
		delayacct_thrashing_start(&in_thrashing);
		psi_memstall_enter(&pflags);
		thrashing = true;
	}

	init_wait(wait);
	wait->func = wake_page_function;
	wait_page.folio = folio;
	wait_page.bit_nr = PG_locked;
	wait->flags = 0;

	spin_lock_irq(&q->lock);
	folio_set_waiters(folio);
	if (!folio_trylock_flag(folio, PG_locked, wait))
		__add_wait_queue_entry_tail(q, wait);
	spin_unlock_irq(&q->lock);

	/*
	 * If a migration entry exists for the page the migration path must hold
	 * a valid reference to the page, and it must take the ptl to remove the
	 * migration entry. So the page is valid until the ptl is dropped.
	 */
	spin_unlock(ptl);

	for (;;) {
		unsigned int flags;

		set_current_state(TASK_UNINTERRUPTIBLE);

		/* Loop until we've been woken or interrupted */
		flags = smp_load_acquire(&wait->flags);
		if (!(flags & WQ_FLAG_WOKEN)) {
			if (signal_pending_state(TASK_UNINTERRUPTIBLE, current))
				break;

			io_schedule();
			continue;
		}
		break;
	}

	finish_wait(q, wait);

	if (thrashing) {
		delayacct_thrashing_end(&in_thrashing);
		psi_memstall_leave(&pflags);
	}
}
#endif

void folio_wait_bit(struct folio *folio, int bit_nr)
{
	folio_wait_bit_common(folio, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit);

int folio_wait_bit_killable(struct folio *folio, int bit_nr)
{
	return folio_wait_bit_common(folio, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(folio_wait_bit_killable);

/**
 * folio_put_wait_locked - Drop a reference and wait for it to be unlocked
 * @folio: The folio to wait for.
 * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
 *
 * The caller should hold a reference on @folio.  They expect the page to
 * become unlocked relatively soon, but do not wish to hold up migration
 * (for example) by holding the reference while waiting for the folio to
 * come unlocked.  After this function returns, the caller should not
 * dereference @folio.
 *
 * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal.
 */
static int folio_put_wait_locked(struct folio *folio, int state)
{
	return folio_wait_bit_common(folio, PG_locked, state, DROP);
}

/**
 * folio_add_wait_queue - Add an arbitrary waiter to a folio's wait queue
 * @folio: Folio defining the wait queue of interest
 * @waiter: Waiter to add to the queue
 *
 * Add an arbitrary @waiter to the wait queue for the nominated @folio.
 */
void folio_add_wait_queue(struct folio *folio, wait_queue_entry_t *waiter)
{
	wait_queue_head_t *q = folio_waitqueue(folio);
	unsigned long flags;

	spin_lock_irqsave(&q->lock, flags);
	__add_wait_queue_entry_tail(q, waiter);
	folio_set_waiters(folio);
	spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL_GPL(folio_add_wait_queue);

/**
 * folio_unlock - Unlock a locked folio.
 * @folio: The folio.
 *
 * Unlocks the folio and wakes up any thread sleeping on the page lock.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_unlock(struct folio *folio)
{
	/* Bit 7 allows x86 to check the byte's sign bit */
	BUILD_BUG_ON(PG_waiters != 7);
	BUILD_BUG_ON(PG_locked > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	if (folio_xor_flags_has_waiters(folio, 1 << PG_locked))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_unlock);
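
/*
 * Illustrative example, not code from this file: the usual pairing for
 * short-term exclusive access to a pagecache folio is
 *
 *	folio_lock(folio);
 *	... the folio cannot be removed from the page cache here ...
 *	folio_unlock(folio);
 */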

/**
 * folio_end_read - End read on a folio.
 * @folio: The folio.
 * @success: True if all reads completed successfully.
 *
 * When all reads against a folio have completed, filesystems should
 * call this function to let the pagecache know that no more reads
 * are outstanding.  This will unlock the folio and wake up any thread
 * sleeping on the lock.  The folio will also be marked uptodate if all
 * reads succeeded.
 *
 * Context: May be called from interrupt or process context.  May not be
 * called from NMI context.
 */
void folio_end_read(struct folio *folio, bool success)
{
	unsigned long mask = 1 << PG_locked;

	/* Must be in bottom byte for x86 to work */
	BUILD_BUG_ON(PG_uptodate > 7);
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	VM_BUG_ON_FOLIO(folio_test_uptodate(folio), folio);

	if (likely(success))
		mask |= 1 << PG_uptodate;
	if (folio_xor_flags_has_waiters(folio, mask))
		folio_wake_bit(folio, PG_locked);
}
EXPORT_SYMBOL(folio_end_read);
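
/*
 * Illustrative example, not code from this file: a filesystem's read
 * completion handler typically finishes with something like
 *
 *	folio_end_read(folio, err == 0);
 *
 * which marks the folio uptodate only on success and always unlocks it.
 */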

/**
 * folio_end_private_2 - Clear PG_private_2 and wake any waiters.
 * @folio: The folio.
 *
 * Clear the PG_private_2 bit on a folio and wake up any sleepers waiting for
 * it.  The folio reference held for PG_private_2 being set is released.
 *
 * This is, for example, used when a netfs folio is being written to a local
 * disk cache, thereby allowing writes to the cache for the same folio to be
 * serialised.
 */
void folio_end_private_2(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_private_2(folio), folio);
	clear_bit_unlock(PG_private_2, folio_flags(folio, 0));
	folio_wake_bit(folio, PG_private_2);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_private_2);

/**
 * folio_wait_private_2 - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio.
 */
void folio_wait_private_2(struct folio *folio)
{
	while (folio_test_private_2(folio))
		folio_wait_bit(folio, PG_private_2);
}
EXPORT_SYMBOL(folio_wait_private_2);

/**
 * folio_wait_private_2_killable - Wait for PG_private_2 to be cleared on a folio.
 * @folio: The folio to wait on.
 *
 * Wait for PG_private_2 (aka PG_fscache) to be cleared on a folio or until a
 * fatal signal is received by the calling task.
 *
 * Return:
 * - 0 if successful.
 * - -EINTR if a fatal signal was encountered.
 */
int folio_wait_private_2_killable(struct folio *folio)
1554 1555 1556
{
	int ret = 0;

1557 1558
	while (folio_test_private_2(folio)) {
		ret = folio_wait_bit_killable(folio, PG_private_2);
1559 1560 1561 1562 1563 1564
		if (ret < 0)
			break;
	}

	return ret;
}
1565
EXPORT_SYMBOL(folio_wait_private_2_killable);
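
/*
 * Illustrative sketch, not part of this file: a caller that must not
 * touch a folio while it is still being written to a local cache can
 * wait killably and propagate -EINTR.  myfs_prepare_write() and
 * myfs_do_write() are hypothetical helpers, not real APIs.
 *
 *	static int myfs_prepare_write(struct folio *folio)
 *	{
 *		int err = folio_wait_private_2_killable(folio);
 *
 *		if (err)
 *			return err;
 *		return myfs_do_write(folio);
 *	}
 */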

/**
 * folio_end_writeback - End writeback against a folio.
 * @folio: The folio.
 *
 * The folio must actually be under writeback.
 *
 * Context: May be called from process or interrupt context.
 */
void folio_end_writeback(struct folio *folio)
{
	VM_BUG_ON_FOLIO(!folio_test_writeback(folio), folio);

	/*
	 * folio_test_clear_reclaim() could be used here but it is an
	 * atomic operation and overkill in this particular case. Failing
	 * to shuffle a folio marked for immediate reclaim is too mild
	 * a gain to justify taking an atomic operation penalty at the
	 * end of every folio writeback.
	 */
	if (folio_test_reclaim(folio)) {
		folio_clear_reclaim(folio);
		folio_rotate_reclaimable(folio);
	}

	/*
	 * Writeback does not hold a folio reference of its own, relying
	 * on truncation to wait for the clearing of PG_writeback.
	 * But here we must make sure that the folio is not freed and
	 * reused before the folio_wake_bit().
	 */
	folio_get(folio);
	if (__folio_end_writeback(folio))
		folio_wake_bit(folio, PG_writeback);
	acct_reclaim_writeback(folio);
	folio_put(folio);
}
EXPORT_SYMBOL(folio_end_writeback);

/**
 * __folio_lock - Get a lock on the folio, assuming we need to sleep to get it.
 * @folio: The folio to lock
 */
void __folio_lock(struct folio *folio)
{
	folio_wait_bit_common(folio, PG_locked, TASK_UNINTERRUPTIBLE,
				EXCLUSIVE);
}
EXPORT_SYMBOL(__folio_lock);

int __folio_lock_killable(struct folio *folio)
{
	return folio_wait_bit_common(folio, PG_locked, TASK_KILLABLE,
					EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__folio_lock_killable);

static int __folio_lock_async(struct folio *folio, struct wait_page_queue *wait)
{
	struct wait_queue_head *q = folio_waitqueue(folio);
	int ret = 0;

	wait->folio = folio;
	wait->bit_nr = PG_locked;

	spin_lock_irq(&q->lock);
	__add_wait_queue_entry_tail(q, &wait->wait);
	folio_set_waiters(folio);
	ret = !folio_trylock(folio);
	/*
	 * If we were successful now, we know we're still on the
	 * waitqueue as we're still under the lock. This means it's
	 * safe to remove and return success; we know the callback
	 * isn't going to trigger.
	 */
	if (!ret)
		__remove_wait_queue(q, &wait->wait);
	else
		ret = -EIOCBQUEUED;
	spin_unlock_irq(&q->lock);
	return ret;
}

/*
 * Return values:
 * 0 - folio is locked.
 * non-zero - folio is not locked.
 *     mmap_lock or per-VMA lock has been released (mmap_read_unlock() or
 *     vma_end_read()), unless flags had both FAULT_FLAG_ALLOW_RETRY and
 *     FAULT_FLAG_RETRY_NOWAIT set, in which case the lock is still held.
 *
 * If neither ALLOW_RETRY nor KILLABLE are set, will always return 0
 * with the folio locked and the mmap_lock/per-VMA lock left unperturbed.
 */
vm_fault_t __folio_lock_or_retry(struct folio *folio, struct vm_fault *vmf)
{
	unsigned int flags = vmf->flags;

	if (fault_flag_allow_retry_first(flags)) {
		/*
		 * CAUTION! In this case, mmap_lock/per-VMA lock is not
		 * released even though returning VM_FAULT_RETRY.
		 */
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			return VM_FAULT_RETRY;

		release_fault_lock(vmf);
		if (flags & FAULT_FLAG_KILLABLE)
			folio_wait_locked_killable(folio);
		else
			folio_wait_locked(folio);
		return VM_FAULT_RETRY;
	}
	if (flags & FAULT_FLAG_KILLABLE) {
		bool ret;

		ret = __folio_lock_killable(folio);
		if (ret) {
			release_fault_lock(vmf);
			return VM_FAULT_RETRY;
		}
	} else {
		__folio_lock(folio);
	}

	return 0;
}

/**
 * page_cache_next_miss() - Find the next gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
 * gap with the lowest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 5, then subsequently a gap is
 * created at index 10, page_cache_next_miss() covering both indices may
 * return 10 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'return - index >= max_scan' will be true).
 * In the rare case of index wrap-around, 0 will be returned.
 */
pgoff_t page_cache_next_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_next(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == 0)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_next_miss);
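
/*
 * Illustrative sketch, not part of this file: a readahead-style
 * heuristic could use page_cache_next_miss() to ask how many
 * consecutive pages starting at @index are already cached, bounded by
 * a caller-chosen "limit":
 *
 *	pgoff_t gap = page_cache_next_miss(mapping, index, limit);
 *	unsigned long nr_cached = gap - index;
 *
 * nr_cached is 0 when @index itself is absent.  Per the wrap-around
 * caveat above, the return value may lie outside the searched range,
 * so callers should clamp it before relying on the difference.
 */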

/**
 * page_cache_prev_miss() - Find the previous gap in the page cache.
 * @mapping: Mapping.
 * @index: Index.
 * @max_scan: Maximum range to search.
 *
 * Search the range [max(index - max_scan + 1, 0), index] for the
 * gap with the highest index.
 *
 * This function may be called under the rcu_read_lock.  However, this will
 * not atomically search a snapshot of the cache at a single point in time.
 * For example, if a gap is created at index 10, then subsequently a gap is
 * created at index 5, page_cache_prev_miss() covering both indices may
 * return 5 if called under the rcu_read_lock.
 *
 * Return: The index of the gap if found, otherwise an index outside the
 * range specified (in which case 'index - return >= max_scan' will be true).
 * In the rare case of wrap-around, ULONG_MAX will be returned.
 */
pgoff_t page_cache_prev_miss(struct address_space *mapping,
			     pgoff_t index, unsigned long max_scan)
{
	XA_STATE(xas, &mapping->i_pages, index);

	while (max_scan--) {
		void *entry = xas_prev(&xas);
		if (!entry || xa_is_value(entry))
			break;
		if (xas.xa_index == ULONG_MAX)
			break;
	}

	return xas.xa_index;
}
EXPORT_SYMBOL(page_cache_prev_miss);

/*
 * Lockless page cache protocol:
 * On the lookup side:
 * 1. Load the folio from i_pages
 * 2. Increment the refcount if it's not zero
 * 3. If the folio is not found by xas_reload(), put the refcount and retry
 *
 * On the removal side:
 * A. Freeze the page (by zeroing the refcount if nobody else has a reference)
 * B. Remove the page from i_pages
 * C. Return the page to the page allocator
 *
 * This means that any page may have its reference count temporarily
 * increased by a speculative page cache (or fast GUP) lookup as it can
 * be allocated by another user before the RCU grace period expires.
 * Because the refcount temporarily acquired here may end up being the
 * last refcount on the page, any page allocation must be freeable by
 * folio_put().
 */

/*
 * filemap_get_entry - Get a page cache entry.
 * @mapping: the address_space to search
 * @index: The page cache index.
 *
 * Looks up the page cache entry at @mapping & @index.  If it is a folio,
 * it is returned with an increased refcount.  If it is a shadow entry
 * of a previously evicted folio, or a swap entry from shmem/tmpfs,
 * it is returned without further action.
 *
 * Return: The folio, swap or shadow entry, %NULL if nothing is found.
 */
void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct folio *folio;

	rcu_read_lock();
repeat:
	xas_reset(&xas);
	folio = xas_load(&xas);
	if (xas_retry(&xas, folio))
		goto repeat;
	/*
	 * A shadow entry of a recently evicted page, or a swap entry from
	 * shmem/tmpfs.  Return it without attempting to raise page count.
	 */
	if (!folio || xa_is_value(folio))
		goto out;

	if (!folio_try_get_rcu(folio))
		goto repeat;

	if (unlikely(folio != xas_reload(&xas))) {
		folio_put(folio);
		goto repeat;
	}
out:
	rcu_read_unlock();

	return folio;
}
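
/*
 * Illustrative sketch, not part of this file: callers of
 * filemap_get_entry() must handle all three outcomes.  NULL means
 * nothing is cached at @index; a value entry (xa_is_value()) is a
 * shadow or swap entry with no reference held; anything else is a
 * folio carrying an extra refcount that must be dropped:
 *
 *	void *entry = filemap_get_entry(mapping, index);
 *
 *	if (entry && !xa_is_value(entry)) {
 *		struct folio *folio = entry;
 *
 *		inspect_folio(folio);
 *		folio_put(folio);
 *	}
 *
 * inspect_folio() is a hypothetical helper.
 */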

/**
 * __filemap_get_folio - Find and get a reference to a folio.
 * @mapping: The address_space to search.
 * @index: The page index.
 * @fgp_flags: %FGP flags modify how the folio is returned.
 * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
 *
 * Looks up the page cache entry at @mapping & @index.
 *
 * If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
 * if the %GFP flags specified for %FGP_CREAT are atomic.
 *
 * If this function returns a folio, it is returned with an increased refcount.
 *
 * Return: The found folio or an ERR_PTR() otherwise.
 */
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
		fgf_t fgp_flags, gfp_t gfp)
{
	struct folio *folio;

repeat:
	folio = filemap_get_entry(mapping, index);
	if (xa_is_value(folio))
		folio = NULL;
	if (!folio)
		goto no_page;

	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			if (!folio_trylock(folio)) {
				folio_put(folio);
				return ERR_PTR(-EAGAIN);
			}
		} else {
			folio_lock(folio);
		}

		/* Has the page been truncated? */
		if (unlikely(folio->mapping != mapping)) {
			folio_unlock(folio);
			folio_put(folio);
			goto repeat;
		}
		VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
	}

	if (fgp_flags & FGP_ACCESSED)
		folio_mark_accessed(folio);
	else if (fgp_flags & FGP_WRITE) {
		/* Clear idle flag for buffer write */
		if (folio_test_idle(folio))
			folio_clear_idle(folio);
	}

	if (fgp_flags & FGP_STABLE)
		folio_wait_stable(folio);
no_page:
	if (!folio && (fgp_flags & FGP_CREAT)) {
		unsigned order = FGF_GET_ORDER(fgp_flags);
		int err;

		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
			gfp |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp &= ~__GFP_FS;
		if (fgp_flags & FGP_NOWAIT) {
			gfp &= ~GFP_KERNEL;
			gfp |= GFP_NOWAIT | __GFP_NOWARN;
		}
		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
			fgp_flags |= FGP_LOCK;

		if (!mapping_large_folio_support(mapping))
			order = 0;
		if (order > MAX_PAGECACHE_ORDER)
			order = MAX_PAGECACHE_ORDER;
		/* If we're not aligned, allocate a smaller folio */
		if (index & ((1UL << order) - 1))
			order = __ffs(index);

		do {
			gfp_t alloc_gfp = gfp;

			err = -ENOMEM;
			if (order == 1)
				order = 0;
			if (order > 0)
				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
			folio = filemap_alloc_folio(alloc_gfp, order);
			if (!folio)
				continue;

			/* Init accessed so avoid atomic mark_page_accessed later */
			if (fgp_flags & FGP_ACCESSED)
				__folio_set_referenced(folio);

			err = filemap_add_folio(mapping, folio, index, gfp);
			if (!err)
				break;
			folio_put(folio);
			folio = NULL;
		} while (order-- > 0);

		if (err == -EEXIST)
			goto repeat;
		if (err)
			return ERR_PTR(err);
		/*
		 * filemap_add_folio locks the page, and for mmap
		 * we expect an unlocked page.
		 */
		if (folio && (fgp_flags & FGP_FOR_MMAP))
			folio_unlock(folio);
	}

	if (!folio)
		return ERR_PTR(-ENOENT);
	return folio;
}
EXPORT_SYMBOL(__filemap_get_folio);
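
/*
 * Illustrative sketch, not part of this file: a typical
 * "find or create, locked" lookup in a buffered write path might look
 * like this (error handling trimmed):
 *
 *	struct folio *folio;
 *
 *	folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT,
 *			FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
 *			mapping_gfp_mask(mapping));
 *	if (IS_ERR(folio))
 *		return PTR_ERR(folio);
 *	...
 *	folio_unlock(folio);
 *	folio_put(folio);
 */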

static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
		xa_mark_t mark)
{
	struct folio *folio;

retry:
	if (mark == XA_PRESENT)
		folio = xas_find(xas, max);
	else
		folio = xas_find_marked(xas, max, mark);

	if (xas_retry(xas, folio))
		goto retry;
	/*
	 * A shadow entry of a recently evicted page, a swap
	 * entry from shmem/tmpfs or a DAX entry.  Return it
	 * without attempting to raise page count.
	 */
	if (!folio || xa_is_value(folio))
		return folio;

	if (!folio_try_get_rcu(folio))
		goto reset;

	if (unlikely(folio != xas_reload(xas))) {
		folio_put(folio);
		goto reset;
	}

	return folio;
reset:
	xas_reset(xas);
	goto retry;
}

/**
 * find_get_entries - gang pagecache lookup
 * @mapping:	The address_space to search
 * @start:	The starting page cache index
 * @end:	The final page index (inclusive).
 * @fbatch:	Where the resulting entries are placed.
 * @indices:	The cache indices corresponding to the entries in @fbatch
 *
 * find_get_entries() will search for and return a batch of entries in
 * the mapping.  The entries are placed in @fbatch.  find_get_entries()
 * takes a reference on any actual folios it returns.
 *
 * The entries have ascending indexes.  The indices may not be consecutive
 * due to not-present entries or large folios.
 *
 * Any shadow entries of evicted folios, or swap entries from
 * shmem/tmpfs, are included in the returned array.
 *
 * Return: The number of entries which were found.
 */
2006
unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
2007
		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
2008
{
2009
	XA_STATE(xas, &mapping->i_pages, *start);
2010
	struct folio *folio;
2011 2012

	rcu_read_lock();
2013
	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
2014 2015
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
2016 2017 2018
			break;
	}
	rcu_read_unlock();
2019

2020 2021 2022 2023 2024
	if (folio_batch_count(fbatch)) {
		unsigned long nr = 1;
		int idx = folio_batch_count(fbatch) - 1;

		folio = fbatch->folios[idx];
2025
		if (!xa_is_value(folio))
2026 2027 2028
			nr = folio_nr_pages(folio);
		*start = indices[idx] + nr;
	}
2029
	return folio_batch_count(fbatch);
2030 2031
}

2032 2033 2034 2035 2036
/**
 * find_lock_entries - Find a batch of pagecache entries.
 * @mapping:	The address_space to search.
 * @start:	The starting page cache index.
 * @end:	The final page index (inclusive).
2037 2038
 * @fbatch:	Where the resulting entries are placed.
 * @indices:	The cache indices of the entries in @fbatch.
2039 2040
 *
 * find_lock_entries() will return a batch of entries from @mapping.
2041 2042 2043 2044
 * Swap, shadow and DAX entries are included.  Folios are returned
 * locked and with an incremented refcount.  Folios which are locked
 * by somebody else or under writeback are skipped.  Folios which are
 * partially outside the range are not returned.
2045 2046
 *
 * The entries have ascending indexes.  The indices may not be consecutive
2047 2048
 * due to not-present entries, large folios, folios which could not be
 * locked or folios under writeback.
2049 2050 2051
 *
 * Return: The number of entries which were found.
 */
2052
unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
2053
		pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices)
2054
{
2055
	XA_STATE(xas, &mapping->i_pages, *start);
2056
	struct folio *folio;
2057 2058

	rcu_read_lock();
2059 2060
	while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
		if (!xa_is_value(folio)) {
2061
			if (folio->index < *start)
2062
				goto put;
2063
			if (folio_next_index(folio) - 1 > end)
2064
				goto put;
2065
			if (!folio_trylock(folio))
2066
				goto put;
2067 2068
			if (folio->mapping != mapping ||
			    folio_test_writeback(folio))
2069
				goto unlock;
2070 2071
			VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
					folio);
2072
		}
2073 2074
		indices[fbatch->nr] = xas.xa_index;
		if (!folio_batch_add(fbatch, folio))
2075
			break;
2076
		continue;
2077
unlock:
2078
		folio_unlock(folio);
2079
put:
2080
		folio_put(folio);
2081 2082 2083
	}
	rcu_read_unlock();

2084 2085 2086 2087 2088
	if (folio_batch_count(fbatch)) {
		unsigned long nr = 1;
		int idx = folio_batch_count(fbatch) - 1;

		folio = fbatch->folios[idx];
2089
		if (!xa_is_value(folio))
2090 2091 2092
			nr = folio_nr_pages(folio);
		*start = indices[idx] + nr;
	}
2093
	return folio_batch_count(fbatch);
2094 2095
}

/**
 * filemap_get_folios - Get a batch of folios
 * @mapping:	The address_space to search
 * @start:	The starting page index
 * @end:	The final page index (inclusive)
 * @fbatch:	The batch to fill.
 *
 * Search for and return a batch of folios in the mapping starting at
 * index @start and up to index @end (inclusive).  The folios are returned
 * in @fbatch with an elevated reference count.
 *
 * Return: The number of folios which were found.
 * We also update @start to index the next folio for the traversal.
 */
unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start,
		pgoff_t end, struct folio_batch *fbatch)
{
	return filemap_get_folios_tag(mapping, start, end, XA_PRESENT, fbatch);
}
EXPORT_SYMBOL(filemap_get_folios);
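
/*
 * Illustrative sketch, not part of this file: a typical caller loops
 * until the batch comes back empty, dropping the references each time
 * round.  inspect_folio() is a hypothetical helper.
 *
 *	struct folio_batch fbatch;
 *	pgoff_t start = 0;
 *	unsigned int i;
 *
 *	folio_batch_init(&fbatch);
 *	while (filemap_get_folios(mapping, &start, (pgoff_t)-1, &fbatch)) {
 *		for (i = 0; i < folio_batch_count(&fbatch); i++)
 *			inspect_folio(fbatch.folios[i]);
 *		folio_batch_release(&fbatch);
 *		cond_resched();
 *	}
 */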

2117
/**
2118
 * filemap_get_folios_contig - Get a batch of contiguous folios
2119
 * @mapping:	The address_space to search
2120 2121 2122
 * @start:	The starting page index
 * @end:	The final page index (inclusive)
 * @fbatch:	The batch to fill
2123
 *
2124 2125 2126
 * filemap_get_folios_contig() works exactly like filemap_get_folios(),
 * except the returned folios are guaranteed to be contiguous. This may
 * not return all contiguous folios if the batch gets filled up.
2127
 *
2128 2129
 * Return: The number of folios found.
 * Also update @start to be positioned for traversal of the next folio.
2130
 */
2131 2132 2133

unsigned filemap_get_folios_contig(struct address_space *mapping,
		pgoff_t *start, pgoff_t end, struct folio_batch *fbatch)
2134
{
2135 2136
	XA_STATE(xas, &mapping->i_pages, *start);
	unsigned long nr;
2137
	struct folio *folio;
Nick Piggin's avatar
Nick Piggin committed
2138 2139

	rcu_read_lock();
2140 2141 2142

	for (folio = xas_load(&xas); folio && xas.xa_index <= end;
			folio = xas_next(&xas)) {
2143
		if (xas_retry(&xas, folio))
2144 2145 2146 2147 2148
			continue;
		/*
		 * If the entry has been swapped out, we can stop looking.
		 * No current caller is looking for DAX entries.
		 */
2149
		if (xa_is_value(folio))
2150
			goto update_start;
2151

2152
		if (!folio_try_get_rcu(folio))
2153
			goto retry;
2154

2155
		if (unlikely(folio != xas_reload(&xas)))
2156
			goto put_folio;
Nick Piggin's avatar
Nick Piggin committed
2157

2158 2159 2160 2161
		if (!folio_batch_add(fbatch, folio)) {
			nr = folio_nr_pages(folio);
			*start = folio->index + nr;
			goto out;
2162
		}
2163
		continue;
2164
put_folio:
2165
		folio_put(folio);
2166

2167 2168
retry:
		xas_reset(&xas);
2169
	}
2170 2171 2172 2173 2174 2175

update_start:
	nr = folio_batch_count(fbatch);

	if (nr) {
		folio = fbatch->folios[nr - 1];
2176
		*start = folio->index + folio_nr_pages(folio);
2177 2178
	}
out:
Nick Piggin's avatar
Nick Piggin committed
2179
	rcu_read_unlock();
2180
	return folio_batch_count(fbatch);
2181
}
2182
EXPORT_SYMBOL(filemap_get_folios_contig);
2183

2184
/**
2185 2186 2187 2188 2189 2190
 * filemap_get_folios_tag - Get a batch of folios matching @tag
 * @mapping:    The address_space to search
 * @start:      The starting page index
 * @end:        The final page index (inclusive)
 * @tag:        The tag index
 * @fbatch:     The batch to fill
2191
 *
2192 2193 2194 2195 2196 2197 2198
 * The first folio may start before @start; if it does, it will contain
 * @start.  The final folio may extend beyond @end; if it does, it will
 * contain @end.  The folios have ascending indices.  There may be gaps
 * between the folios if there are indices which have no folio in the
 * page cache.  If folios are added to or removed from the page cache
 * while this is running, they may or may not be found by this call.
 * Only returns folios that are tagged with @tag.
2199
 *
2200 2201
 * Return: The number of folios found.
 * Also update @start to index the next folio for traversal.
Linus Torvalds's avatar
Linus Torvalds committed
2202
 */
2203 2204
unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start,
			pgoff_t end, xa_mark_t tag, struct folio_batch *fbatch)
Linus Torvalds's avatar
Linus Torvalds committed
2205
{
2206
	XA_STATE(xas, &mapping->i_pages, *start);
2207
	struct folio *folio;
Nick Piggin's avatar
Nick Piggin committed
2208 2209

	rcu_read_lock();
2210
	while ((folio = find_get_entry(&xas, end, tag)) != NULL) {
2211 2212 2213
		/*
		 * Shadow entries should never be tagged, but this iteration
		 * is lockless so there is a window for page reclaim to evict
2214
		 * a page we saw tagged. Skip over it.
2215
		 */
2216
		if (xa_is_value(folio))
2217
			continue;
2218 2219 2220
		if (!folio_batch_add(fbatch, folio)) {
			unsigned long nr = folio_nr_pages(folio);
			*start = folio->index + nr;
2221 2222
			goto out;
		}
Nick Piggin's avatar
Nick Piggin committed
2223
	}
2224
	/*
	 * We come here when there is no page beyond @end. We take care to not
	 * overflow the index @start as it confuses some of the callers. This
	 * breaks the iteration when there is a page at index -1 but that is
	 * already broken anyway.
	 */
	if (end == (pgoff_t)-1)
		*start = (pgoff_t)-1;
	else
		*start = end + 1;
out:
	rcu_read_unlock();

	return folio_batch_count(fbatch);
}
EXPORT_SYMBOL(filemap_get_folios_tag);
Linus Torvalds's avatar
Linus Torvalds committed
2240

/*
 * CD/DVDs are error prone. When a medium error occurs, the driver may fail
 * a _large_ part of the i/o request. Imagine the worst scenario:
 *
 *      ---R__________________________________________B__________
 *         ^ reading here                             ^ bad block(assume 4k)
 *
 * read(R) => miss => readahead(R...B) => media error => frustrating retries
 * => failing the whole request => read(R) => read(R+1) =>
 * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
 * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
 * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
 *
 * It is going insane. Fix it by quickly scaling down the readahead size.
 */
static void shrink_readahead_size_eio(struct file_ra_state *ra)
{
	ra->ra_pages /= 4;
}

2261
/*
2262
 * filemap_get_read_batch - Get a batch of folios for read
2263
 *
2264 2265 2266 2267 2268
 * Get a batch of folios which represent a contiguous range of bytes in
 * the file.  No exceptional entries will be returned.  If @index is in
 * the middle of a folio, the entire folio will be returned.  The last
 * folio in the batch may have the readahead flag set or the uptodate flag
 * clear so that the caller can take the appropriate action.
2269 2270
 */
static void filemap_get_read_batch(struct address_space *mapping,
2271
		pgoff_t index, pgoff_t max, struct folio_batch *fbatch)
2272 2273
{
	XA_STATE(xas, &mapping->i_pages, index);
2274
	struct folio *folio;
2275 2276

	rcu_read_lock();
2277 2278
	for (folio = xas_load(&xas); folio; folio = xas_next(&xas)) {
		if (xas_retry(&xas, folio))
2279
			continue;
2280
		if (xas.xa_index > max || xa_is_value(folio))
2281
			break;
2282 2283
		if (xa_is_sibling(folio))
			break;
2284
		if (!folio_try_get_rcu(folio))
2285 2286
			goto retry;

2287
		if (unlikely(folio != xas_reload(&xas)))
2288
			goto put_folio;
2289

2290
		if (!folio_batch_add(fbatch, folio))
2291
			break;
2292
		if (!folio_test_uptodate(folio))
2293
			break;
2294
		if (folio_test_readahead(folio))
2295
			break;
2296
		xas_advance(&xas, folio_next_index(folio) - 1);
2297
		continue;
2298
put_folio:
2299
		folio_put(folio);
2300 2301 2302 2303 2304 2305
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
}

2306
static int filemap_read_folio(struct file *file, filler_t filler,
2307
		struct folio *folio)
2308
{
2309 2310
	bool workingset = folio_test_workingset(folio);
	unsigned long pflags;
2311 2312 2313
	int error;

	/*
2314
	 * A previous I/O error may have been due to temporary failures,
2315
	 * eg. multipath errors.  PG_error will be set again if read_folio
2316
	 * fails.
2317
	 */
2318
	folio_clear_error(folio);
2319

2320
	/* Start the actual read. The read will unlock the page. */
2321 2322
	if (unlikely(workingset))
		psi_memstall_enter(&pflags);
2323
	error = filler(file, folio);
2324 2325
	if (unlikely(workingset))
		psi_memstall_leave(&pflags);
2326 2327
	if (error)
		return error;
2328

2329
	error = folio_wait_locked_killable(folio);
2330 2331
	if (error)
		return error;
2332
	if (folio_test_uptodate(folio))
2333
		return 0;
2334 2335
	if (file)
		shrink_readahead_size_eio(&file->f_ra);
2336
	return -EIO;
2337 2338
}

2339
static bool filemap_range_uptodate(struct address_space *mapping,
2340 2341
		loff_t pos, size_t count, struct folio *folio,
		bool need_uptodate)
2342
{
2343
	if (folio_test_uptodate(folio))
2344 2345
		return true;
	/* pipes can't handle partially uptodate pages */
2346
	if (need_uptodate)
2347 2348 2349
		return false;
	if (!mapping->a_ops->is_partially_uptodate)
		return false;
2350
	if (mapping->host->i_blkbits >= folio_shift(folio))
2351 2352
		return false;

2353 2354
	if (folio_pos(folio) > pos) {
		count -= folio_pos(folio) - pos;
2355 2356
		pos = 0;
	} else {
2357
		pos -= folio_pos(folio);
2358 2359
	}

2360
	return mapping->a_ops->is_partially_uptodate(folio, pos, count);
2361 2362
}

2363
static int filemap_update_page(struct kiocb *iocb,
2364 2365
		struct address_space *mapping, size_t count,
		struct folio *folio, bool need_uptodate)
2366 2367 2368
{
	int error;

2369 2370 2371 2372 2373 2374 2375
	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!filemap_invalidate_trylock_shared(mapping))
			return -EAGAIN;
	} else {
		filemap_invalidate_lock_shared(mapping);
	}

2376
	if (!folio_trylock(folio)) {
2377
		error = -EAGAIN;
2378
		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
2379
			goto unlock_mapping;
2380
		if (!(iocb->ki_flags & IOCB_WAITQ)) {
2381
			filemap_invalidate_unlock_shared(mapping);
2382 2383 2384 2385 2386
			/*
			 * This is where we usually end up waiting for a
			 * previously submitted readahead to finish.
			 */
			folio_put_wait_locked(folio, TASK_KILLABLE);
2387
			return AOP_TRUNCATED_PAGE;
2388
		}
2389
		error = __folio_lock_async(folio, iocb->ki_waitq);
2390
		if (error)
2391
			goto unlock_mapping;
2392 2393
	}

2394
	error = AOP_TRUNCATED_PAGE;
2395
	if (!folio->mapping)
2396
		goto unlock;
2397

2398
	error = 0;
2399 2400
	if (filemap_range_uptodate(mapping, iocb->ki_pos, count, folio,
				   need_uptodate))
2401 2402 2403 2404 2405 2406
		goto unlock;

	error = -EAGAIN;
	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
		goto unlock;

2407 2408
	error = filemap_read_folio(iocb->ki_filp, mapping->a_ops->read_folio,
			folio);
2409
	goto unlock_mapping;
2410
unlock:
2411
	folio_unlock(folio);
2412 2413 2414
unlock_mapping:
	filemap_invalidate_unlock_shared(mapping);
	if (error == AOP_TRUNCATED_PAGE)
2415
		folio_put(folio);
2416
	return error;
2417 2418
}

2419
static int filemap_create_folio(struct file *file,
2420
		struct address_space *mapping, pgoff_t index,
2421
		struct folio_batch *fbatch)
2422
{
2423
	struct folio *folio;
2424 2425
	int error;

2426 2427
	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), 0);
	if (!folio)
2428
		return -ENOMEM;
2429

2430
	/*
2431 2432 2433 2434 2435 2436 2437 2438
	 * Protect against truncate / hole punch. Grabbing invalidate_lock
	 * here assures we cannot instantiate and bring uptodate new
	 * pagecache folios after evicting page cache during truncate
	 * and before actually freeing blocks.	Note that we could
	 * release invalidate_lock after inserting the folio into
	 * the page cache as the locked folio would then be enough to
	 * synchronize with hole punching. But there are code paths
	 * such as filemap_update_page() filling in partially uptodate
2439
	 * pages or ->readahead() that need to hold invalidate_lock
2440 2441
	 * while mapping blocks for IO so let's hold the lock here as
	 * well to keep locking rules simple.
2442 2443
	 */
	filemap_invalidate_lock_shared(mapping);
2444
	error = filemap_add_folio(mapping, folio, index,
2445 2446 2447 2448 2449 2450
			mapping_gfp_constraint(mapping, GFP_KERNEL));
	if (error == -EEXIST)
		error = AOP_TRUNCATED_PAGE;
	if (error)
		goto error;

2451
	error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
2452 2453 2454
	if (error)
		goto error;

2455
	filemap_invalidate_unlock_shared(mapping);
2456
	folio_batch_add(fbatch, folio);
2457 2458
	return 0;
error:
2459
	filemap_invalidate_unlock_shared(mapping);
2460
	folio_put(folio);
2461
	return error;
2462 2463
}

2464
static int filemap_readahead(struct kiocb *iocb, struct file *file,
2465
		struct address_space *mapping, struct folio *folio,
2466 2467
		pgoff_t last_index)
{
2468 2469
	DEFINE_READAHEAD(ractl, file, &file->f_ra, mapping, folio->index);

2470 2471
	if (iocb->ki_flags & IOCB_NOIO)
		return -EAGAIN;
2472
	page_cache_async_ra(&ractl, folio, last_index - folio->index);
2473 2474 2475
	return 0;
}

2476 2477
static int filemap_get_pages(struct kiocb *iocb, size_t count,
		struct folio_batch *fbatch, bool need_uptodate)
2478 2479 2480 2481 2482
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
2483
	pgoff_t last_index;
2484
	struct folio *folio;
2485
	int err = 0;
2486

2487
	/* "last_index" is the index of the page beyond the end of the read */
2488
	last_index = DIV_ROUND_UP(iocb->ki_pos + count, PAGE_SIZE);
2489
retry:
2490 2491 2492
	if (fatal_signal_pending(current))
		return -EINTR;

2493
	filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
2494
	if (!folio_batch_count(fbatch)) {
2495 2496 2497 2498
		if (iocb->ki_flags & IOCB_NOIO)
			return -EAGAIN;
		page_cache_sync_readahead(mapping, ra, filp, index,
				last_index - index);
2499
		filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
2500
	}
2501
	if (!folio_batch_count(fbatch)) {
2502 2503
		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
			return -EAGAIN;
2504
		err = filemap_create_folio(filp, mapping,
2505
				iocb->ki_pos >> PAGE_SHIFT, fbatch);
2506
		if (err == AOP_TRUNCATED_PAGE)
2507
			goto retry;
2508 2509
		return err;
	}
2510

2511
	folio = fbatch->folios[folio_batch_count(fbatch) - 1];
2512 2513
	if (folio_test_readahead(folio)) {
		err = filemap_readahead(iocb, filp, mapping, folio, last_index);
2514 2515 2516
		if (err)
			goto err;
	}
2517
	if (!folio_test_uptodate(folio)) {
2518 2519
		if ((iocb->ki_flags & IOCB_WAITQ) &&
		    folio_batch_count(fbatch) > 1)
2520
			iocb->ki_flags |= IOCB_NOWAIT;
2521 2522
		err = filemap_update_page(iocb, mapping, count, folio,
					  need_uptodate);
2523 2524
		if (err)
			goto err;
2525 2526
	}

2527
	return 0;
2528
err:
2529
	if (err < 0)
2530
		folio_put(folio);
2531
	if (likely(--fbatch->nr))
2532
		return 0;
2533
	if (err == AOP_TRUNCATED_PAGE)
2534 2535
		goto retry;
	return err;
2536 2537
}

2538 2539 2540 2541 2542 2543 2544
static inline bool pos_same_folio(loff_t pos1, loff_t pos2, struct folio *folio)
{
	unsigned int shift = folio_shift(folio);

	return (pos1 >> shift == pos2 >> shift);
}

2545
/**
2546 2547 2548 2549
 * filemap_read - Read data from the page cache.
 * @iocb: The iocb to read.
 * @iter: Destination for the data.
 * @already_read: Number of bytes already read by the caller.
2550
 *
2551
 * Copies data from the page cache.  If the data is not currently present,
2552
 * uses the readahead and read_folio address_space operations to fetch it.
Linus Torvalds's avatar
Linus Torvalds committed
2553
 *
2554 2555 2556
 * Return: Total number of bytes copied, including those already read by
 * the caller.  If an error happens before any bytes are copied, returns
 * a negative error number.
Linus Torvalds's avatar
Linus Torvalds committed
2557
 */
2558 2559
ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
		ssize_t already_read)
Linus Torvalds's avatar
Linus Torvalds committed
2560
{
2561
	struct file *filp = iocb->ki_filp;
2562
	struct file_ra_state *ra = &filp->f_ra;
2563
	struct address_space *mapping = filp->f_mapping;
Linus Torvalds's avatar
Linus Torvalds committed
2564
	struct inode *inode = mapping->host;
2565
	struct folio_batch fbatch;
2566
	int i, error = 0;
2567 2568
	bool writably_mapped;
	loff_t isize, end_offset;
2569
	loff_t last_pos = ra->prev_pos;
Linus Torvalds's avatar
Linus Torvalds committed
2570

2571
	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
2572
		return 0;
2573 2574 2575
	if (unlikely(!iov_iter_count(iter)))
		return 0;

2576
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
2577
	folio_batch_init(&fbatch);
2578

2579
	do {
Linus Torvalds's avatar
Linus Torvalds committed
2580
		cond_resched();
2581

2582
		/*
2583 2584 2585
		 * If we've already successfully copied some data, then we
		 * can no longer safely return -EIOCBQUEUED. Hence mark
		 * an async read NOWAIT at that point.
2586
		 */
2587
		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
2588 2589
			iocb->ki_flags |= IOCB_NOWAIT;

2590 2591 2592
		if (unlikely(iocb->ki_pos >= i_size_read(inode)))
			break;

David Howells's avatar
David Howells committed
2593
		error = filemap_get_pages(iocb, iter->count, &fbatch, false);
2594
		if (error < 0)
2595
			break;
Linus Torvalds's avatar
Linus Torvalds committed
2596

2597 2598 2599 2600 2601 2602 2603 2604 2605 2606
		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);
		if (unlikely(iocb->ki_pos >= isize))
2607
			goto put_folios;
2608 2609 2610 2611 2612 2613 2614 2615 2616
		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);

		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(mapping);

		/*
2617
		 * When a read accesses the same folio several times, only
2618 2619
		 * mark it as accessed the first time.
		 */
2620 2621
		if (!pos_same_folio(iocb->ki_pos, last_pos - 1,
				    fbatch.folios[0]))
2622
			folio_mark_accessed(fbatch.folios[0]);
2623

2624 2625
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
2626 2627
			size_t fsize = folio_size(folio);
			size_t offset = iocb->ki_pos & (fsize - 1);
2628
			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
2629
					     fsize - offset);
2630
			size_t copied;
2631

2632
			if (end_offset < folio_pos(folio))
2633 2634
				break;
			if (i > 0)
2635
				folio_mark_accessed(folio);
2636
			/*
2637 2638 2639
			 * If users can be writing to this folio using arbitrary
			 * virtual addresses, take care of potential aliasing
			 * before reading the folio on the kernel side.
2640
			 */
2641 2642
			if (writably_mapped)
				flush_dcache_folio(folio);
2643

2644
			copied = copy_folio_to_iter(folio, offset, bytes, iter);
2645

2646
			already_read += copied;
2647
			iocb->ki_pos += copied;
2648
			last_pos = iocb->ki_pos;
2649 2650 2651 2652 2653

			if (copied < bytes) {
				error = -EFAULT;
				break;
			}
Linus Torvalds's avatar
Linus Torvalds committed
2654
		}
2655 2656 2657 2658
put_folios:
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			folio_put(fbatch.folios[i]);
		folio_batch_init(&fbatch);
2659
	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
Linus Torvalds's avatar
Linus Torvalds committed
2660

2661
	file_accessed(filp);
2662
	ra->prev_pos = last_pos;
2663
	return already_read ? already_read : error;
Linus Torvalds's avatar
Linus Torvalds committed
2664
}
2665
EXPORT_SYMBOL_GPL(filemap_read);
Linus Torvalds's avatar
Linus Torvalds committed
2666

int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + count - 1;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (filemap_range_needs_writeback(mapping, pos, end))
			return -EAGAIN;
		return 0;
	}

	return filemap_write_and_wait_range(mapping, pos, end);
}
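
/*
 * Illustrative sketch, not part of this file: a direct-I/O read path
 * typically flushes dirty pagecache over the range before issuing the
 * I/O, honouring IOCB_NOWAIT, much as generic_file_read_iter() does
 * further down in this file:
 *
 *	ret = kiocb_write_and_wait(iocb, count);
 *	if (ret < 0)
 *		return ret;
 *	ret = mapping->a_ops->direct_IO(iocb, iter);
 */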

int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	loff_t pos = iocb->ki_pos;
	loff_t end = pos + count - 1;
	int ret;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		/* we could block if there are any pages in the range */
		if (filemap_range_has_page(mapping, pos, end))
			return -EAGAIN;
	} else {
		ret = filemap_write_and_wait_range(mapping, pos, end);
		if (ret)
			return ret;
	}

	/*
	 * After a write we want buffered reads to be sure to go to disk to get
	 * the new data.  We invalidate clean cached page from the region we're
	 * about to write.  We do this *before* the write so that we can return
	 * without clobbering -EIOCBQUEUED from ->direct_IO().
	 */
	return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
					     end >> PAGE_SHIFT);
}

/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb:	kernel I/O control block
 * @iter:	destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.
 *
 * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
 * be returned when no data can be read without waiting for I/O requests
 * to complete; it doesn't prevent readahead.
 *
 * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
 * requests shall be made for the read or for readahead.  When no data
 * can be read, -EAGAIN shall be returned.  When readahead would be
 * triggered, a partial, possibly empty read shall be returned.
 *
 * Return:
 * * number of bytes copied, even for partial reads
 * * negative error code (or 0 if IOCB_NOIO) if nothing was read
 */
ssize_t
2731
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
Linus Torvalds's avatar
Linus Torvalds committed
2732
{
2733
	size_t count = iov_iter_count(iter);
2734
	ssize_t retval = 0;
2735 2736

	if (!count)
2737
		return 0; /* skip atime */
Linus Torvalds's avatar
Linus Torvalds committed
2738

2739
	if (iocb->ki_flags & IOCB_DIRECT) {
2740
		struct file *file = iocb->ki_filp;
2741 2742
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
Linus Torvalds's avatar
Linus Torvalds committed
2743

2744 2745 2746
		retval = kiocb_write_and_wait(iocb, count);
		if (retval < 0)
			return retval;
2747 2748
		file_accessed(file);

2749
		retval = mapping->a_ops->direct_IO(iocb, iter);
2750
		if (retval >= 0) {
2751
			iocb->ki_pos += retval;
2752
			count -= retval;
2753
		}
2754 2755
		if (retval != -EIOCBQUEUED)
			iov_iter_revert(iter, count - iov_iter_count(iter));
2756

2757 2758 2759 2760 2761 2762
		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
2763 2764
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
2765
		 */
2766 2767 2768
		if (retval < 0 || !count || IS_DAX(inode))
			return retval;
		if (iocb->ki_pos >= i_size_read(inode))
2769
			return retval;
Linus Torvalds's avatar
Linus Torvalds committed
2770 2771
	}

2772
	return filemap_read(iocb, iter, retval);
}
EXPORT_SYMBOL(generic_file_read_iter);
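
/*
 * Illustrative sketch, not part of this file: a filesystem that keeps
 * its data in the pagecache can usually wire this straight into its
 * file_operations ("myfs" is a hypothetical filesystem):
 *
 *	const struct file_operations myfs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.mmap		= generic_file_mmap,
 *	};
 */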

/*
 * Splice subpages from a folio into a pipe.
 */
size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
			      struct folio *folio, loff_t fpos, size_t size)
{
	struct page *page;
	size_t spliced = 0, offset = offset_in_folio(folio, fpos);

	page = folio_page(folio, offset / PAGE_SIZE);
	size = min(size, folio_size(folio) - offset);
	offset %= PAGE_SIZE;

	while (spliced < size &&
	       !pipe_full(pipe->head, pipe->tail, pipe->max_usage)) {
		struct pipe_buffer *buf = pipe_head_buf(pipe);
		size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);

		*buf = (struct pipe_buffer) {
			.ops	= &page_cache_pipe_buf_ops,
			.page	= page,
			.offset	= offset,
			.len	= part,
		};
		folio_get(folio);
		pipe->head++;
		page++;
		spliced += part;
		offset = 0;
	}

	return spliced;
}

/**
 * filemap_splice_read -  Splice data from a file's pagecache into a pipe
 * @in: The file to read from
 * @ppos: Pointer to the file position to read from
 * @pipe: The pipe to splice into
 * @len: The amount to splice
 * @flags: The SPLICE_F_* flags
 *
 * This function gets folios from a file's pagecache and splices them into the
 * pipe.  Readahead will be called as necessary to fill more folios.  This may
 * be used for blockdevs also.
 *
 * Return: On success, the number of bytes read will be returned and *@ppos
 * will be updated if appropriate; 0 will be returned if there is no more data
 * to be read; -EAGAIN will be returned if the pipe had no space, and some
 * other negative error code will be returned on error.  A short read may occur
 * if the pipe has insufficient space, we reach the end of the data or we hit a
 * hole.
2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839
 */
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
			    struct pipe_inode_info *pipe,
			    size_t len, unsigned int flags)
{
	struct folio_batch fbatch;
	struct kiocb iocb;
	size_t total_spliced = 0, used, npages;
	loff_t isize, end_offset;
	bool writably_mapped;
	int i, error = 0;

	if (unlikely(*ppos >= in->f_mapping->host->i_sb->s_maxbytes))
		return 0;

	init_sync_kiocb(&iocb, in);
	iocb.ki_pos = *ppos;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_occupancy(pipe->head, pipe->tail);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);

	folio_batch_init(&fbatch);

	do {
		cond_resched();

		if (*ppos >= i_size_read(in->f_mapping->host))
			break;

		iocb.ki_pos = *ppos;
		error = filemap_get_pages(&iocb, len, &fbatch, true);
		if (error < 0)
			break;

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(in->f_mapping->host);
		if (unlikely(*ppos >= isize))
			break;
		end_offset = min_t(loff_t, isize, *ppos + len);

		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(in->f_mapping);

		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
			size_t n;

			if (folio_pos(folio) >= end_offset)
				goto out;
			folio_mark_accessed(folio);

			/*
			 * If users can be writing to this folio using arbitrary
			 * virtual addresses, take care of potential aliasing
			 * before reading the folio on the kernel side.
			 */
			if (writably_mapped)
				flush_dcache_folio(folio);

			n = min_t(loff_t, len, isize - *ppos);
			n = splice_folio_into_pipe(pipe, folio, *ppos, n);
			if (!n)
				goto out;
			len -= n;
			total_spliced += n;
			*ppos += n;
			in->f_ra.prev_pos = *ppos;
			if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
				goto out;
		}

		folio_batch_release(&fbatch);
	} while (len);

out:
	folio_batch_release(&fbatch);
	file_accessed(in);

	return total_spliced ? total_spliced : error;
}
EXPORT_SYMBOL(filemap_splice_read);
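
/*
 * Illustrative sketch, not part of this file: pagecache-backed
 * filesystems typically point ->splice_read at this helper alongside
 * their ->read_iter:
 *
 *	.read_iter	= generic_file_read_iter,
 *	.splice_read	= filemap_splice_read,
 */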
2921

2922 2923
static inline loff_t folio_seek_hole_data(struct xa_state *xas,
		struct address_space *mapping, struct folio *folio,
2924
		loff_t start, loff_t end, bool seek_data)
2925
{
2926 2927 2928
	const struct address_space_operations *ops = mapping->a_ops;
	size_t offset, bsz = i_blocksize(mapping->host);

2929
	if (xa_is_value(folio) || folio_test_uptodate(folio))
2930 2931 2932 2933 2934 2935
		return seek_data ? start : end;
	if (!ops->is_partially_uptodate)
		return seek_data ? end : start;

	xas_pause(xas);
	rcu_read_unlock();
2936 2937
	folio_lock(folio);
	if (unlikely(folio->mapping != mapping))
2938 2939
		goto unlock;

2940
	offset = offset_in_folio(folio, start) & ~(bsz - 1);
2941 2942

	do {
2943
		if (ops->is_partially_uptodate(folio, offset, bsz) ==
2944
							seek_data)
2945 2946 2947
			break;
		start = (start + bsz) & ~(bsz - 1);
		offset += bsz;
2948
	} while (offset < folio_size(folio));
2949
unlock:
2950
	folio_unlock(folio);
2951 2952
	rcu_read_lock();
	return start;
2953 2954
}

2955
static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
2956
{
2957
	if (xa_is_value(folio))
2958
		return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
2959
	return folio_size(folio);
2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974
}

/**
 * mapping_seek_hole_data - Seek for SEEK_DATA / SEEK_HOLE in the page cache.
 * @mapping: Address space to search.
 * @start: First byte to consider.
 * @end: Limit of search (exclusive).
 * @whence: Either SEEK_HOLE or SEEK_DATA.
 *
 * If the page cache knows which blocks contain holes and which blocks
 * contain data, your filesystem can use this function to implement
 * SEEK_HOLE and SEEK_DATA.  This is useful for filesystems which are
 * entirely memory-based such as tmpfs, and filesystems which support
 * unwritten extents.
 *
Ingo Molnar's avatar
Ingo Molnar committed
2975
 * Return: The requested offset on success, or -ENXIO if @whence specifies
2976 2977 2978 2979 2980 2981 2982 2983
 * SEEK_DATA and there is no data after @start.  There is an implicit hole
 * after @end - 1, so SEEK_HOLE returns @end if all the bytes between @start
 * and @end contain data.
 */
loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
		loff_t end, int whence)
{
	XA_STATE(xas, &mapping->i_pages, start >> PAGE_SHIFT);
2984
	pgoff_t max = (end - 1) >> PAGE_SHIFT;
2985
	bool seek_data = (whence == SEEK_DATA);
2986
	struct folio *folio;
2987 2988 2989 2990 2991

	if (end <= start)
		return -ENXIO;

	rcu_read_lock();
2992
	while ((folio = find_get_entry(&xas, max, XA_PRESENT))) {
2993
		loff_t pos = (u64)xas.xa_index << PAGE_SHIFT;
2994
		size_t seek_size;
2995 2996 2997 2998 2999 3000 3001

		if (start < pos) {
			if (!seek_data)
				goto unlock;
			start = pos;
		}

3002 3003 3004
		seek_size = seek_folio_size(&xas, folio);
		pos = round_up((u64)pos + 1, seek_size);
		start = folio_seek_hole_data(&xas, mapping, folio, start, pos,
3005 3006
				seek_data);
		if (start < pos)
3007
			goto unlock;
3008 3009 3010 3011
		if (start >= end)
			break;
		if (seek_size > PAGE_SIZE)
			xas_set(&xas, pos >> PAGE_SHIFT);
3012 3013
		if (!xa_is_value(folio))
			folio_put(folio);
3014 3015
	}
	if (seek_data)
3016
		start = -ENXIO;
3017 3018
unlock:
	rcu_read_unlock();
3019 3020
	if (folio && !xa_is_value(folio))
		folio_put(folio);
3021 3022 3023 3024 3025
	if (start > end)
		return end;
	return start;
}

Linus Torvalds's avatar
Linus Torvalds committed
3026 3027
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS  (100)
3028
/*
 * lock_folio_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
 * @vmf - the vm_fault for this fault.
 * @folio - the folio to lock.
 * @fpin - the pointer to the file we may pin (or is already pinned).
 *
 * This works similarly to lock_folio_or_retry in that it can drop the
 * mmap_lock.  It differs in that it returns 1 with the folio locked, and
 * 0 if it couldn't lock the folio.  If we did have to drop the mmap_lock
 * then fpin will point to the pinned file and needs to be fput()'ed at a
 * later point.
 */
3040
static int lock_folio_maybe_drop_mmap(struct vm_fault *vmf, struct folio *folio,
3041 3042
				     struct file **fpin)
{
3043
	if (folio_trylock(folio))
3044 3045
		return 1;

3046 3047
	/*
	 * NOTE! This will make us return with VM_FAULT_RETRY, but with
3048
	 * the fault lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
3049 3050
	 * is supposed to work. We have way too many special cases..
	 */
3051 3052 3053 3054 3055
	if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
		return 0;

	*fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
	if (vmf->flags & FAULT_FLAG_KILLABLE) {
3056
		if (__folio_lock_killable(folio)) {
3057
			/*
3058 3059 3060 3061 3062
			 * We didn't have the right flags to drop the
			 * fault lock, but all fault_handlers only check
			 * for fatal signals if we return VM_FAULT_RETRY,
			 * so we need to drop the fault lock here and
			 * return 0 if we don't have a fpin.
3063 3064
			 */
			if (*fpin == NULL)
3065
				release_fault_lock(vmf);
3066 3067 3068
			return 0;
		}
	} else
3069 3070
		__folio_lock(folio);

3071 3072 3073
	return 1;
}

3074
/*
3075 3076 3077 3078 3079
 * Synchronous readahead happens when we don't even find a page in the page
 * cache at all.  We don't want to perform IO under the mmap sem, so if we have
 * to drop the mmap sem we return the file that was pinned in order for us to do
 * that.  If we didn't pin a file then we return NULL.  The file that is
 * returned needs to be fput()'ed when we're done with it.
3080
 */
3081
static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
3082
{
3083 3084
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
3085
	struct address_space *mapping = file->f_mapping;
3086
	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
3087
	struct file *fpin = NULL;
3088
	unsigned long vm_flags = vmf->vma->vm_flags;
3089
	unsigned int mmap_miss;
3090

3091 3092
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	/* Use the readahead code, even if readahead is disabled */
3093
	if (vm_flags & VM_HUGEPAGE) {
3094 3095 3096 3097 3098 3099 3100
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
		ra->size = HPAGE_PMD_NR;
		/*
		 * Fetch two PMD folios, so we get the chance to actually
		 * readahead, unless we've been told not to.
		 */
3101
		if (!(vm_flags & VM_RAND_READ))
3102 3103 3104 3105 3106 3107 3108
			ra->size *= 2;
		ra->async_size = HPAGE_PMD_NR;
		page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
		return fpin;
	}
#endif

3109
	/* If we don't want any read-ahead, don't bother */
3110
	if (vm_flags & VM_RAND_READ)
3111
		return fpin;
3112
	if (!ra->ra_pages)
3113
		return fpin;
3114

3115
	if (vm_flags & VM_SEQ_READ) {
3116
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3117
		page_cache_sync_ra(&ractl, ra->ra_pages);
3118
		return fpin;
3119 3120
	}

3121
	/* Avoid banging the cache line if not needed */
3122 3123 3124
	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss < MMAP_LOTSAMISS * 10)
		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
3125 3126 3127 3128 3129

	/*
	 * Do we miss much more than hit in this file? If so,
	 * stop bothering with read-ahead. It will only hurt.
	 */
3130
	if (mmap_miss > MMAP_LOTSAMISS)
3131
		return fpin;
3132

3133 3134 3135
	/*
	 * mmap read-around
	 */
3136
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
3137
	ra->start = max_t(long, 0, vmf->pgoff - ra->ra_pages / 2);
3138 3139
	ra->size = ra->ra_pages;
	ra->async_size = ra->ra_pages / 4;
3140
	ractl._index = ra->start;
3141
	page_cache_ra_order(&ractl, ra, 0);
3142
	return fpin;
3143 3144 3145 3146
}

/*
 * Asynchronous readahead happens when we find the page and PG_readahead,
 * so we want to possibly extend the readahead further.  We return the file that
 * was pinned if we have to drop the mmap_lock in order to do IO.
 */
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
					    struct folio *folio)
{
	struct file *file = vmf->vma->vm_file;
	struct file_ra_state *ra = &file->f_ra;
	DEFINE_READAHEAD(ractl, file, ra, file->f_mapping, vmf->pgoff);
	struct file *fpin = NULL;
	unsigned int mmap_miss;

	/* If we don't want any read-ahead, don't bother */
	if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
		return fpin;

	mmap_miss = READ_ONCE(ra->mmap_miss);
	if (mmap_miss)
		WRITE_ONCE(ra->mmap_miss, --mmap_miss);

	if (folio_test_readahead(folio)) {
		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
		page_cache_async_ra(&ractl, folio, ra->ra_pages);
	}
	return fpin;
}

/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:	struct vm_fault containing details of the fault
 *
 * filemap_fault() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * The goto's are kind of ugly, but this streamlines the normal case of having
 * it in the page cache, and handles the special cases reasonably without
 * having a lot of duplicated code.
 *
 * vma->vm_mm->mmap_lock must be held on entry.
 *
 * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
 * may be dropped before doing I/O or by lock_folio_maybe_drop_mmap().
 *
 * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
 * has not been released.
 *
 * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
 *
 * Return: bitwise-OR of %VM_FAULT_ codes.
 */
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
	int error;
	struct file *file = vmf->vma->vm_file;
	struct file *fpin = NULL;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	pgoff_t max_idx, index = vmf->pgoff;
	struct folio *folio;
	vm_fault_t ret = 0;
	bool mapping_locked = false;

	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(index >= max_idx))
		return VM_FAULT_SIGBUS;

	/*
	 * Do we have something in the page cache already?
	 */
	folio = filemap_get_folio(mapping, index);
	if (likely(!IS_ERR(folio))) {
		/*
		 * We found the page, so try async readahead before waiting for
		 * the lock.
		 */
		if (!(vmf->flags & FAULT_FLAG_TRIED))
			fpin = do_async_mmap_readahead(vmf, folio);
		if (unlikely(!folio_test_uptodate(folio))) {
			filemap_invalidate_lock_shared(mapping);
			mapping_locked = true;
		}
	} else {
		/* No page in the page cache at all */
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
		fpin = do_sync_mmap_readahead(vmf);
retry_find:
		/*
		 * See comment in filemap_create_folio() why we need
		 * invalidate_lock
		 */
		if (!mapping_locked) {
			filemap_invalidate_lock_shared(mapping);
			mapping_locked = true;
		}
		folio = __filemap_get_folio(mapping, index,
					  FGP_CREAT|FGP_FOR_MMAP,
					  vmf->gfp_mask);
		if (IS_ERR(folio)) {
			if (fpin)
				goto out_retry;
			filemap_invalidate_unlock_shared(mapping);
			return VM_FAULT_OOM;
		}
	}

	if (!lock_folio_maybe_drop_mmap(vmf, folio, &fpin))
		goto out_retry;

	/* Did it get truncated? */
	if (unlikely(folio->mapping != mapping)) {
		folio_unlock(folio);
		folio_put(folio);
		goto retry_find;
	}
	VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);

	/*
	 * We have a locked folio in the page cache, now we need to check
	 * that it's up-to-date. If not, it is going to be due to an error,
	 * or because readahead was otherwise unable to retrieve it.
	 */
	if (unlikely(!folio_test_uptodate(folio))) {
		/*
		 * If the invalidate lock is not held, the folio was in cache
		 * and uptodate and now it is not. Strange but possible since we
		 * didn't hold the page lock all the time. Let's drop
		 * everything, get the invalidate lock and try again.
		 */
		if (!mapping_locked) {
			folio_unlock(folio);
			folio_put(folio);
			goto retry_find;
		}

		/*
		 * OK, the folio is really not uptodate. This can be because the
		 * VMA has the VM_RAND_READ flag set, or because an error
		 * arose. Let's read it in directly.
		 */
		goto page_not_uptodate;
	}

	/*
	 * We've made it this far and we had to drop our mmap_lock, now is the
	 * time to return to the upper layer and have it re-find the vma and
	 * redo the fault.
	 */
	if (fpin) {
		folio_unlock(folio);
		goto out_retry;
	}
	if (mapping_locked)
		filemap_invalidate_unlock_shared(mapping);

	/*
	 * Found the page and have a reference on it.
	 * We must recheck i_size under page lock.
	 */
	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (unlikely(index >= max_idx)) {
		folio_unlock(folio);
		folio_put(folio);
		return VM_FAULT_SIGBUS;
	}

	vmf->page = folio_file_page(folio, index);
	return ret | VM_FAULT_LOCKED;

page_not_uptodate:
	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	fpin = maybe_unlock_mmap_for_io(vmf, fpin);
	error = filemap_read_folio(file, mapping->a_ops->read_folio, folio);
	if (fpin)
		goto out_retry;
	folio_put(folio);

	if (!error || error == AOP_TRUNCATED_PAGE)
		goto retry_find;
	filemap_invalidate_unlock_shared(mapping);

	return VM_FAULT_SIGBUS;

out_retry:
	/*
	 * We dropped the mmap_lock, we need to return to the fault handler to
	 * re-find the vma and come back and find our hopefully still populated
	 * page.
	 */
	if (!IS_ERR(folio))
		folio_put(folio);
	if (mapping_locked)
		filemap_invalidate_unlock_shared(mapping);
	if (fpin)
		fput(fpin);
	return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
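
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * filesystems that need their own ->page_mkwrite typically still reuse
 * filemap_fault() and filemap_map_pages() in a private vm_operations_struct
 * and install it from their ->mmap method.  The myfs_* names below are
 * hypothetical; several filesystems follow this pattern, sometimes wrapping
 * filemap_fault() in a thin helper.
 *
 *	static const struct vm_operations_struct myfs_file_vm_ops = {
 *		.fault		= filemap_fault,
 *		.map_pages	= filemap_map_pages,
 *		.page_mkwrite	= myfs_page_mkwrite,
 *	};
 *
 *	static int myfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 *	{
 *		file_accessed(file);
 *		vma->vm_ops = &myfs_file_vm_ops;
 *		return 0;
 *	}
 */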

static bool filemap_map_pmd(struct vm_fault *vmf, struct folio *folio,
		pgoff_t start)
{
	struct mm_struct *mm = vmf->vma->vm_mm;

	/* Huge page is mapped? No need to proceed. */
	if (pmd_trans_huge(*vmf->pmd)) {
		folio_unlock(folio);
		folio_put(folio);
		return true;
	}

	if (pmd_none(*vmf->pmd) && folio_test_pmd_mappable(folio)) {
		struct page *page = folio_file_page(folio, start);
		vm_fault_t ret = do_set_pmd(vmf, page);
		if (!ret) {
			/* The page is mapped successfully, reference consumed. */
			folio_unlock(folio);
			return true;
		}
	}

	if (pmd_none(*vmf->pmd))
		pmd_install(mm, vmf->pmd, &vmf->prealloc_pte);

	return false;
}

static struct folio *next_uptodate_folio(struct xa_state *xas,
		struct address_space *mapping, pgoff_t end_pgoff)
{
	struct folio *folio = xas_next_entry(xas, end_pgoff);
	unsigned long max_idx;

	do {
		if (!folio)
			return NULL;
		if (xas_retry(xas, folio))
			continue;
		if (xa_is_value(folio))
			continue;
		if (folio_test_locked(folio))
			continue;
		if (!folio_try_get_rcu(folio))
			continue;
		/* Has the page moved or been split? */
		if (unlikely(folio != xas_reload(xas)))
			goto skip;
		if (!folio_test_uptodate(folio) || folio_test_readahead(folio))
			goto skip;
		if (!folio_trylock(folio))
			goto skip;
		if (folio->mapping != mapping)
			goto unlock;
		if (!folio_test_uptodate(folio))
			goto unlock;
		max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
		if (xas->xa_index >= max_idx)
			goto unlock;
		return folio;
unlock:
		folio_unlock(folio);
skip:
		folio_put(folio);
	} while ((folio = xas_next_entry(xas, end_pgoff)) != NULL);

	return NULL;
}

/*
 * Map the page range [start_page, start_page + nr_pages) of the folio.
 * start_page is obtained from start via folio_page(folio, start).
 */
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
			struct folio *folio, unsigned long start,
			unsigned long addr, unsigned int nr_pages,
			unsigned int *mmap_miss)
{
	vm_fault_t ret = 0;
	struct page *page = folio_page(folio, start);
	unsigned int count = 0;
	pte_t *old_ptep = vmf->pte;

	do {
		if (PageHWPoison(page + count))
			goto skip;

		(*mmap_miss)++;

		/*
		 * NOTE: If there're PTE markers, we'll leave them to be
		 * handled in the specific fault path, and it'll prohibit the
		 * fault-around logic.
		 */
		if (!pte_none(ptep_get(&vmf->pte[count])))
			goto skip;

		count++;
		continue;
skip:
		if (count) {
			set_pte_range(vmf, folio, page, count, addr);
			folio_ref_add(folio, count);
			if (in_range(vmf->address, addr, count * PAGE_SIZE))
				ret = VM_FAULT_NOPAGE;
		}

		count++;
		page += count;
		vmf->pte += count;
		addr += count * PAGE_SIZE;
		count = 0;
	} while (--nr_pages > 0);

	if (count) {
		set_pte_range(vmf, folio, page, count, addr);
		folio_ref_add(folio, count);
		if (in_range(vmf->address, addr, count * PAGE_SIZE))
			ret = VM_FAULT_NOPAGE;
	}

	vmf->pte = old_ptep;

	return ret;
}

static vm_fault_t filemap_map_order0_folio(struct vm_fault *vmf,
		struct folio *folio, unsigned long addr,
		unsigned int *mmap_miss)
{
	vm_fault_t ret = 0;
	struct page *page = &folio->page;

	if (PageHWPoison(page))
		return ret;

	(*mmap_miss)++;

	/*
	 * NOTE: If there're PTE markers, we'll leave them to be
	 * handled in the specific fault path, and it'll prohibit
	 * the fault-around logic.
	 */
	if (!pte_none(ptep_get(vmf->pte)))
		return ret;

	if (vmf->address == addr)
		ret = VM_FAULT_NOPAGE;

	set_pte_range(vmf, folio, page, 1, addr);
	folio_ref_inc(folio);

	return ret;
}

vm_fault_t filemap_map_pages(struct vm_fault *vmf,
			     pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct vm_area_struct *vma = vmf->vma;
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t last_pgoff = start_pgoff;
	unsigned long addr;
	XA_STATE(xas, &mapping->i_pages, start_pgoff);
	struct folio *folio;
	vm_fault_t ret = 0;
	unsigned int nr_pages = 0, mmap_miss = 0, mmap_miss_saved;

	rcu_read_lock();
	folio = next_uptodate_folio(&xas, mapping, end_pgoff);
	if (!folio)
		goto out;

	if (filemap_map_pmd(vmf, folio, start_pgoff)) {
		ret = VM_FAULT_NOPAGE;
		goto out;
	}

	addr = vma->vm_start + ((start_pgoff - vma->vm_pgoff) << PAGE_SHIFT);
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, addr, &vmf->ptl);
	if (!vmf->pte) {
		folio_unlock(folio);
		folio_put(folio);
		goto out;
	}
	do {
		unsigned long end;

		addr += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
		vmf->pte += xas.xa_index - last_pgoff;
		last_pgoff = xas.xa_index;
		end = folio_next_index(folio) - 1;
		nr_pages = min(end, end_pgoff) - xas.xa_index + 1;

		if (!folio_test_large(folio))
			ret |= filemap_map_order0_folio(vmf,
					folio, addr, &mmap_miss);
		else
			ret |= filemap_map_folio_range(vmf, folio,
					xas.xa_index - folio->index, addr,
					nr_pages, &mmap_miss);

		folio_unlock(folio);
		folio_put(folio);
	} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
out:
	rcu_read_unlock();

	mmap_miss_saved = READ_ONCE(file->f_ra.mmap_miss);
	if (mmap_miss >= mmap_miss_saved)
		WRITE_ONCE(file->f_ra.mmap_miss, 0);
	else
		WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss_saved - mmap_miss);

	return ret;
}
EXPORT_SYMBOL(filemap_map_pages);

vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	struct folio *folio = page_folio(vmf->page);
	vm_fault_t ret = VM_FAULT_LOCKED;

	sb_start_pagefault(mapping->host->i_sb);
	file_update_time(vmf->vma->vm_file);
	folio_lock(folio);
	if (folio->mapping != mapping) {
		folio_unlock(folio);
		ret = VM_FAULT_NOPAGE;
		goto out;
	}
	/*
	 * We mark the folio dirty already here so that when freeze is in
	 * progress, we are guaranteed that writeback during freezing will
	 * see the dirty folio and writeprotect it again.
	 */
	folio_mark_dirty(folio);
	folio_wait_stable(folio);
out:
	sb_end_pagefault(mapping->host->i_sb);
	return ret;
}

const struct vm_operations_struct generic_file_vm_ops = {
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= filemap_page_mkwrite,
};

/* This is used for a general mmap of a disk file */

int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct address_space *mapping = file->f_mapping;

	if (!mapping->a_ops->read_folio)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &generic_file_vm_ops;
	return 0;
}

/*
 * This is for filesystems which do not implement ->writepage.
 */
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma_is_shared_maywrite(vma))
		return -EINVAL;
	return generic_file_mmap(file, vma);
}
#else
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
	return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
int generic_file_readonly_mmap(struct file *file, struct vm_area_struct *vma)
{
	return -ENOSYS;
}
#endif /* CONFIG_MMU */

EXPORT_SYMBOL(filemap_page_mkwrite);
EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
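
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * a filesystem that cannot write back dirty page cache (no ->writepage) can
 * point ->mmap at generic_file_readonly_mmap so that shared writable mappings
 * are refused while read-only mappings still use the generic fault paths
 * above.  The myfs_* name is hypothetical.
 *
 *	const struct file_operations myfs_ro_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.mmap		= generic_file_readonly_mmap,
 *		.splice_read	= filemap_splice_read,
 *	};
 */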

static struct folio *do_read_cache_folio(struct address_space *mapping,
		pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;
	int err;

	if (!filler)
		filler = mapping->a_ops->read_folio;
repeat:
	folio = filemap_get_folio(mapping, index);
	if (IS_ERR(folio)) {
		folio = filemap_alloc_folio(gfp, 0);
		if (!folio)
			return ERR_PTR(-ENOMEM);
		err = filemap_add_folio(mapping, folio, index, gfp);
		if (unlikely(err)) {
			folio_put(folio);
			if (err == -EEXIST)
				goto repeat;
			/* Presumably ENOMEM for xarray node */
			return ERR_PTR(err);
		}

		goto filler;
	}
	if (folio_test_uptodate(folio))
		goto out;

	if (!folio_trylock(folio)) {
		folio_put_wait_locked(folio, TASK_UNINTERRUPTIBLE);
		goto repeat;
	}

	/* Folio was truncated from mapping */
	if (!folio->mapping) {
		folio_unlock(folio);
		folio_put(folio);
		goto repeat;
	}

	/* Someone else locked and filled the page in a very small window */
	if (folio_test_uptodate(folio)) {
		folio_unlock(folio);
		goto out;
	}

filler:
	err = filemap_read_folio(file, filler, folio);
	if (err) {
		folio_put(folio);
		if (err == AOP_TRUNCATED_PAGE)
			goto repeat;
		return ERR_PTR(err);
	}

out:
	folio_mark_accessed(folio);
	return folio;
}

/**
 * read_cache_folio - Read into page cache, fill it if needed.
 * @mapping: The address_space to read from.
 * @index: The index to read.
 * @filler: Function to perform the read, or NULL to use aops->read_folio().
 * @file: Passed to filler function, may be NULL if not required.
 *
 * Read one page into the page cache.  If it succeeds, the folio returned
 * will contain @index, but it may not be the first page of the folio.
 *
 * If the filler function returns an error, it will be returned to the
 * caller.
 *
 * Context: May sleep.  Expects mapping->invalidate_lock to be held.
 * Return: An uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
		filler_t filler, struct file *file)
{
	return do_read_cache_folio(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_folio);
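
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * a typical caller reads one folio, maps the part it needs and drops the
 * reference when done.  The myfs_* name is hypothetical and error handling
 * is reduced to the minimum; as documented above, the caller is expected to
 * hold mapping->invalidate_lock around the call.
 *
 *	static int myfs_read_u32(struct address_space *mapping, pgoff_t index,
 *				 u32 *out)
 *	{
 *		loff_t pos = (loff_t)index << PAGE_SHIFT;
 *		struct folio *folio;
 *		u32 *p;
 *
 *		folio = read_cache_folio(mapping, index, NULL, NULL);
 *		if (IS_ERR(folio))
 *			return PTR_ERR(folio);
 *
 *		p = kmap_local_folio(folio, offset_in_folio(folio, pos));
 *		*out = *p;
 *		kunmap_local(p);
 *		folio_put(folio);
 *		return 0;
 *	}
 */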

/**
 * mapping_read_folio_gfp - Read into page cache, using specified allocation flags.
 * @mapping:	The address_space for the folio.
 * @index:	The index that the allocated folio will contain.
 * @gfp:	The page allocator flags to use if allocating.
 *
 * This is the same as "read_cache_folio(mapping, index, NULL, NULL)", but with
 * any new memory allocations done using the specified allocation flags.
 *
 * The most likely error from this function is EIO, but ENOMEM is
 * possible and so is EINTR.  If ->read_folio returns another error,
 * that will be returned to the caller.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: Uptodate folio on success, ERR_PTR() on failure.
 */
struct folio *mapping_read_folio_gfp(struct address_space *mapping,
		pgoff_t index, gfp_t gfp)
{
	return do_read_cache_folio(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(mapping_read_folio_gfp);

static struct page *do_read_cache_page(struct address_space *mapping,
		pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
	struct folio *folio;

	folio = do_read_cache_folio(mapping, index, filler, file, gfp);
	if (IS_ERR(folio))
		return &folio->page;
	return folio_file_page(folio, index);
}

struct page *read_cache_page(struct address_space *mapping,
			pgoff_t index, filler_t *filler, struct file *file)
{
	return do_read_cache_page(mapping, index, filler, file,
			mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);

/**
 * read_cache_page_gfp - read into page cache, using specified page allocation flags.
 * @mapping:	the page's address_space
 * @index:	the page index
 * @gfp:	the page allocator flags to use if allocating
 *
 * This is the same as "read_mapping_page(mapping, index, NULL)", but with
 * any new page allocations done using the specified allocation flags.
 *
 * If the page does not get brought uptodate, return -EIO.
 *
 * The function expects mapping->invalidate_lock to be already held.
 *
 * Return: up to date page on success, ERR_PTR() on failure.
 */
struct page *read_cache_page_gfp(struct address_space *mapping,
				pgoff_t index,
				gfp_t gfp)
{
	return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);

/*
 * Warn about a page cache invalidation failure during a direct I/O write.
 */
static void dio_warn_stale_pagecache(struct file *filp)
{
	static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
	char pathname[128];
	char *path;

	errseq_set(&filp->f_mapping->wb_err, -EIO);
	if (__ratelimit(&_rs)) {
		path = file_path(filp, pathname, sizeof(pathname));
		if (IS_ERR(path))
			path = "(unknown)";
		pr_crit("Page cache invalidation failure on direct I/O.  Possible data corruption due to collision with buffered I/O!\n");
		pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid,
			current->comm);
	}
}

void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;

	if (mapping->nrpages &&
	    invalidate_inode_pages2_range(mapping,
			iocb->ki_pos >> PAGE_SHIFT,
			(iocb->ki_pos + count - 1) >> PAGE_SHIFT))
		dio_warn_stale_pagecache(iocb->ki_filp);
}

ssize_t
generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	size_t write_len = iov_iter_count(from);
	ssize_t written;

	/*
	 * If a page cannot be invalidated, return 0 to fall back
	 * to buffered write.
	 */
	written = kiocb_invalidate_pages(iocb, write_len);
	if (written) {
		if (written == -EBUSY)
			return 0;
		return written;
	}

	written = mapping->a_ops->direct_IO(iocb, from);

	/*
	 * Finally, try again to invalidate clean pages which might have been
	 * cached by non-direct readahead, or faulted in by get_user_pages()
	 * if the source of the write was an mmap'ed region of the file
	 * we're writing.  Either one is a pretty crazy thing to do,
	 * so we don't support it 100%.  If this invalidation
	 * fails, tough, the write still worked...
	 *
	 * Most of the time we do not need this since dio_complete() will do
	 * the invalidation for us. However there are some file systems that
	 * do not end up with dio_complete() being called, so let's not break
	 * them by removing it completely.
	 *
	 * A noticeable example is blkdev_direct_IO().
	 *
	 * Skip invalidation for async writes or if mapping has no pages.
	 */
	if (written > 0) {
		struct inode *inode = mapping->host;
		loff_t pos = iocb->ki_pos;

		kiocb_invalidate_post_direct_write(iocb, written);
		pos += written;
		write_len -= written;
		if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) {
			i_size_write(inode, pos);
			mark_inode_dirty(inode);
		}
		iocb->ki_pos = pos;
	}
	if (written != -EIOCBQUEUED)
		iov_iter_revert(from, write_len - iov_iter_count(from));
	return written;
}
EXPORT_SYMBOL(generic_file_direct_write);

ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
{
	struct file *file = iocb->ki_filp;
	loff_t pos = iocb->ki_pos;
	struct address_space *mapping = file->f_mapping;
	const struct address_space_operations *a_ops = mapping->a_ops;
	long status = 0;
	ssize_t written = 0;

	do {
		struct page *page;
		unsigned long offset;	/* Offset into pagecache page */
		unsigned long bytes;	/* Bytes to write to page */
		size_t copied;		/* Bytes copied from user */
		void *fsdata = NULL;

		offset = (pos & (PAGE_SIZE - 1));
		bytes = min_t(unsigned long, PAGE_SIZE - offset,
						iov_iter_count(i));

again:
		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 */
		if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
			status = -EFAULT;
			break;
		}

		if (fatal_signal_pending(current)) {
			status = -EINTR;
			break;
		}

		status = a_ops->write_begin(file, mapping, pos, bytes,
						&page, &fsdata);
		if (unlikely(status < 0))
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
		flush_dcache_page(page);

		status = a_ops->write_end(file, mapping, pos, bytes, copied,
						page, fsdata);
		if (unlikely(status != copied)) {
			iov_iter_revert(i, copied - max(status, 0L));
			if (unlikely(status < 0))
				break;
		}
		cond_resched();

		if (unlikely(status == 0)) {
			/*
			 * A short copy made ->write_end() reject the
			 * thing entirely.  Might be memory poisoning
			 * halfway through, might be a race with munmap,
			 * might be severe memory pressure.
			 */
			if (copied)
				bytes = copied;
			goto again;
		}
		pos += status;
		written += status;

		balance_dirty_pages_ratelimited(mapping);
	} while (iov_iter_count(i));

	if (!written)
		return status;
	iocb->ki_pos += written;
	return written;
}
EXPORT_SYMBOL(generic_perform_write);

/**
 * __generic_file_write_iter - write data to a file
 * @iocb:	IO state structure (file, offset, etc.)
 * @from:	iov_iter with data to write
 *
 * This function does all the work needed for actually writing data to a
 * file. It does all basic checks, removes SUID from the file, updates
 * modification times and calls proper subroutines depending on whether we
 * do direct IO or a standard buffered write.
 *
 * It expects i_rwsem to be grabbed unless we work on a block device or similar
 * object which does not need locking at all.
 *
 * This function does *not* take care of syncing data in case of O_SYNC write.
 * A caller has to handle it. This is mainly due to the fact that we want to
 * avoid syncing under i_rwsem.
 *
 * Return:
 * * number of bytes written, even for truncated writes
 * * negative error code if no data has been written at all
 */
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;

	ret = file_remove_privs(file);
	if (ret)
		return ret;

	ret = file_update_time(file);
	if (ret)
		return ret;

	if (iocb->ki_flags & IOCB_DIRECT) {
		ret = generic_file_direct_write(iocb, from);
		/*
		 * If the write stopped short of completing, fall back to
		 * buffered writes.  Some filesystems do this for writes to
		 * holes, for example.  For DAX files, a buffered write will
		 * not succeed (even if it did, DAX does not handle dirty
		 * page-cache pages correctly).
		 */
		if (ret < 0 || !iov_iter_count(from) || IS_DAX(inode))
			return ret;
		return direct_write_fallback(iocb, from, ret,
				generic_perform_write(iocb, from));
	}

	return generic_perform_write(iocb, from);
}
EXPORT_SYMBOL(__generic_file_write_iter);

/**
 * generic_file_write_iter - write data to a file
 * @iocb:	IO state structure
 * @from:	iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes care of syncing the file in case of O_SYNC file
 * and acquires i_rwsem as needed.
 * Return:
 * * negative error code if no data has been written at all or
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);
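
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * simple filesystems that keep all file data in the page cache can wire the
 * generic helpers straight into their file_operations.  The myfs_* name is
 * hypothetical; ramfs uses essentially this shape.
 *
 *	const struct file_operations myfs_file_operations = {
 *		.llseek		= generic_file_llseek,
 *		.read_iter	= generic_file_read_iter,
 *		.write_iter	= generic_file_write_iter,
 *		.mmap		= generic_file_mmap,
 *		.fsync		= noop_fsync,
 *		.splice_read	= filemap_splice_read,
 *		.splice_write	= iter_file_splice_write,
 *	};
 */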

/**
 * filemap_release_folio() - Release fs-specific metadata on a folio.
 * @folio: The folio which the kernel is trying to free.
 * @gfp: Memory allocation flags (and I/O mode).
 *
 * The address_space is trying to release any data attached to a folio
 * (presumably at folio->private).
 *
 * This will also be called if the private_2 flag is set on a folio,
 * indicating that the folio has other metadata associated with it.
 *
 * The @gfp argument specifies whether I/O may be performed to release
 * this folio (__GFP_IO), and whether the call may block
 * (__GFP_RECLAIM & __GFP_FS).
 *
 * Return: %true if the release was successful, otherwise %false.
 */
bool filemap_release_folio(struct folio *folio, gfp_t gfp)
{
	struct address_space * const mapping = folio->mapping;

	BUG_ON(!folio_test_locked(folio));
	if (!folio_needs_release(folio))
		return true;
	if (folio_test_writeback(folio))
		return false;

	if (mapping && mapping->a_ops->release_folio)
		return mapping->a_ops->release_folio(folio, gfp);
	return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);

#ifdef CONFIG_CACHESTAT_SYSCALL
/**
 * filemap_cachestat() - compute the page cache statistics of a mapping
 * @mapping:	The mapping to compute the statistics for.
 * @first_index:	The starting page cache index.
 * @last_index:	The final page index (inclusive).
 * @cs:	the cachestat struct to write the result to.
 *
 * This will query the page cache statistics of a mapping in the
 * page range of [first_index, last_index] (inclusive). The statistics
 * queried include: number of dirty pages, number of pages marked for
 * writeback, and the number of (recently) evicted pages.
 */
static void filemap_cachestat(struct address_space *mapping,
		pgoff_t first_index, pgoff_t last_index, struct cachestat *cs)
{
	XA_STATE(xas, &mapping->i_pages, first_index);
	struct folio *folio;

	rcu_read_lock();
	xas_for_each(&xas, folio, last_index) {
		unsigned long nr_pages;
		pgoff_t folio_first_index, folio_last_index;

		if (xas_retry(&xas, folio))
			continue;

		if (xa_is_value(folio)) {
			/* page is evicted */
			void *shadow = (void *)folio;
			bool workingset; /* not used */
			int order = xa_get_order(xas.xa, xas.xa_index);

			nr_pages = 1 << order;
			folio_first_index = round_down(xas.xa_index, 1 << order);
			folio_last_index = folio_first_index + nr_pages - 1;

			/* Folios might straddle the range boundaries, only count covered pages */
			if (folio_first_index < first_index)
				nr_pages -= first_index - folio_first_index;

			if (folio_last_index > last_index)
				nr_pages -= folio_last_index - last_index;

			cs->nr_evicted += nr_pages;

#ifdef CONFIG_SWAP /* implies CONFIG_MMU */
			if (shmem_mapping(mapping)) {
				/* shmem file - in swap cache */
				swp_entry_t swp = radix_to_swp_entry(folio);

				shadow = get_shadow_from_swap_cache(swp);
			}
#endif
			if (workingset_test_recent(shadow, true, &workingset))
				cs->nr_recently_evicted += nr_pages;

			goto resched;
		}

		nr_pages = folio_nr_pages(folio);
		folio_first_index = folio_pgoff(folio);
		folio_last_index = folio_first_index + nr_pages - 1;

		/* Folios might straddle the range boundaries, only count covered pages */
		if (folio_first_index < first_index)
			nr_pages -= first_index - folio_first_index;

		if (folio_last_index > last_index)
			nr_pages -= folio_last_index - last_index;

		/* page is in cache */
		cs->nr_cache += nr_pages;

		if (folio_test_dirty(folio))
			cs->nr_dirty += nr_pages;

		if (folio_test_writeback(folio))
			cs->nr_writeback += nr_pages;

resched:
		if (need_resched()) {
			xas_pause(&xas);
			cond_resched_rcu();
		}
	}
	rcu_read_unlock();
}

/*
 * The cachestat(2) system call.
 *
 * cachestat() returns the page cache statistics of a file in the
 * byte range specified by `off` and `len`: number of cached pages,
 * number of dirty pages, number of pages marked for writeback,
 * number of evicted pages, and number of recently evicted pages.
 *
 * An evicted page is a page that was previously in the page cache
 * but has been evicted since. A page is recently evicted if its last
 * eviction was recent enough that its reentry to the cache would
 * indicate that it is actively being used by the system, and that
 * there is memory pressure on the system.
 *
 * `off` and `len` must be non-negative integers. If `len` > 0,
 * the queried range is [`off`, `off` + `len`]. If `len` == 0,
 * we will query in the range from `off` to the end of the file.
 *
 * The `flags` argument is unused for now, but is included for future
 * extensibility. The user should pass 0 (i.e. no flag specified).
 *
 * Currently, hugetlbfs is not supported.
 *
 * Because the status of a page can change after cachestat() checks it
 * but before it returns to the application, the returned values may
 * contain stale information.
 *
 * return values:
 *  zero        - success
 *  -EFAULT     - cstat or cstat_range points to an illegal address
 *  -EINVAL     - invalid flags
 *  -EBADF      - invalid file descriptor
 *  -EOPNOTSUPP - file descriptor is of a hugetlbfs file
 */
SYSCALL_DEFINE4(cachestat, unsigned int, fd,
		struct cachestat_range __user *, cstat_range,
		struct cachestat __user *, cstat, unsigned int, flags)
{
	struct fd f = fdget(fd);
	struct address_space *mapping;
	struct cachestat_range csr;
	struct cachestat cs;
	pgoff_t first_index, last_index;

	if (!f.file)
		return -EBADF;

	if (copy_from_user(&csr, cstat_range,
			sizeof(struct cachestat_range))) {
		fdput(f);
		return -EFAULT;
	}

	/* hugetlbfs is not supported */
	if (is_file_hugepages(f.file)) {
		fdput(f);
		return -EOPNOTSUPP;
	}

	if (flags != 0) {
		fdput(f);
		return -EINVAL;
	}

	first_index = csr.off >> PAGE_SHIFT;
	last_index =
		csr.len == 0 ? ULONG_MAX : (csr.off + csr.len - 1) >> PAGE_SHIFT;
	memset(&cs, 0, sizeof(struct cachestat));
	mapping = f.file->f_mapping;
	filemap_cachestat(mapping, first_index, last_index, &cs);
	fdput(f);

	if (copy_to_user(cstat, &cs, sizeof(struct cachestat)))
		return -EFAULT;

	return 0;
}
#endif /* CONFIG_CACHESTAT_SYSCALL */
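
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * a minimal userspace caller of cachestat(2).  It assumes kernel headers that
 * provide __NR_cachestat (the syscall was added in v6.5) and the uapi
 * definitions of struct cachestat_range / struct cachestat in <linux/mman.h>.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/mman.h>
 *
 *	int main(int argc, char **argv)
 *	{
 *		struct cachestat_range range = { 0, 0 };	// len == 0: whole file
 *		struct cachestat cs;
 *		int fd = open(argv[1], O_RDONLY);
 *
 *		if (fd < 0 || syscall(__NR_cachestat, fd, &range, &cs, 0))
 *			return 1;
 *		printf("cached %llu dirty %llu writeback %llu evicted %llu\n",
 *		       (unsigned long long)cs.nr_cache,
 *		       (unsigned long long)cs.nr_dirty,
 *		       (unsigned long long)cs.nr_writeback,
 *		       (unsigned long long)cs.nr_evicted);
 *		close(fd);
 *		return 0;
 *	}
 */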