/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>

#include <asm/pgtable.h>

/*
 * We may have stale swap cache pages in memory: notice
 * them here and get rid of the unnecessary final write.
 */
static int swap_writepage(struct page *page)
{
	if (remove_exclusive_swap_page(page)) {
		unlock_page(page);
		return 0;
	}
	rw_swap_page(WRITE, page);
	return 0;
}

/*
 * swapper_space doesn't have a real inode, so it gets its own
 * vm_writeback(); that way generic_vm_writeback() needs no swap
 * special cases.
 *
 * FIXME: swap pages are locked, but not PageWriteback, while under
 * writeout.  This will confuse throttling in shrink_cache().  It may be
 * advantageous to set PG_writeback against swap pages while they are
 * also locked.  Either that, or special-case swap pages in
 * shrink_cache().
 */
static int swap_vm_writeback(struct page *page, int *nr_to_write)
{
	struct address_space *mapping = page->mapping;

	unlock_page(page);
	return generic_writeback_mapping(mapping, nr_to_write);
}

static struct address_space_operations swap_aops = {
	vm_writeback: swap_vm_writeback,
	writepage: swap_writepage,
	sync_page: block_sync_page,
};

/*
 * swapper_inode doesn't do much.  It is really only here to avoid
 * some special-casing in other parts of the kernel.
 */
static struct inode swapper_inode = {
	i_mapping:		&swapper_space,
};

struct address_space swapper_space = {
	page_tree:	RADIX_TREE_INIT(GFP_ATOMIC),
	page_lock:	RW_LOCK_UNLOCKED,
	clean_pages:	LIST_HEAD_INIT(swapper_space.clean_pages),
	dirty_pages:	LIST_HEAD_INIT(swapper_space.dirty_pages),
	io_pages:	LIST_HEAD_INIT(swapper_space.io_pages),
	locked_pages:	LIST_HEAD_INIT(swapper_space.locked_pages),
	host:		&swapper_inode,
	a_ops:		&swap_aops,
	i_shared_lock:	SPIN_LOCK_UNLOCKED,
	private_lock:	SPIN_LOCK_UNLOCKED,
	private_list:	LIST_HEAD_INIT(swapper_space.private_list),
};

#ifdef SWAP_CACHE_INFO
#define INC_CACHE_INFO(x)	(swap_cache_info.x++)

static struct {
	unsigned long add_total;
	unsigned long del_total;
	unsigned long find_success;
	unsigned long find_total;
	unsigned long noent_race;
	unsigned long exist_race;
} swap_cache_info;

void show_swap_cache_info(void)
{
	printk("Swap cache: add %lu, delete %lu, find %lu/%lu, race %lu+%lu\n",
		swap_cache_info.add_total, swap_cache_info.del_total,
		swap_cache_info.find_success, swap_cache_info.find_total,
		swap_cache_info.noent_race, swap_cache_info.exist_race);
}
#else
#define INC_CACHE_INFO(x)	do { } while (0)
#endif
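
/*
 * With SWAP_CACHE_INFO defined, show_swap_cache_info() prints a single
 * summary line, for example (figures are illustrative only):
 *
 *	Swap cache: add 4711, delete 4508, find 1024/2048, race 0+3
 *
 * "find a/b" is successful lookups over total lookups; "race x+y" is
 * the noent_race and exist_race counts respectively.
 */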

int add_to_swap_cache(struct page *page, swp_entry_t entry)
{
	int error;

	if (page->mapping)
		BUG();
	if (!swap_duplicate(entry)) {
		INC_CACHE_INFO(noent_race);
		return -ENOENT;
	}

	error = add_to_page_cache_unique(page, &swapper_space, entry.val);
	if (error != 0) {
		swap_free(entry);
		if (error == -EEXIST)
			INC_CACHE_INFO(exist_race);
		return error;
	}
	if (!PageLocked(page))
		BUG();
	if (!PageSwapCache(page))
		BUG();
	INC_CACHE_INFO(add_total);
	return 0;
}
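
/*
 * Hypothetical caller sketch (the real pattern is read_swap_cache_async()
 * below): the page must be freshly allocated and in no mapping yet.  On
 * success it is left locked and in the swap cache, holding the extra
 * swap reference taken by swap_duplicate():
 *
 *	page = alloc_page(GFP_HIGHUSER);
 *	if (page && add_to_swap_cache(page, entry) == 0)
 *		rw_swap_page(READ, page);
 *	else if (page)
 *		page_cache_release(page);
 */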

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
	if (!PageLocked(page))
		BUG();
	if (!PageSwapCache(page))
		BUG();
	ClearPageDirty(page);
	__remove_inode_page(page);
	INC_CACHE_INFO(del_total);
}

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list:
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
	swp_entry_t entry;

	if (!PageLocked(page))
		BUG();

	block_flushpage(page, 0);

	entry.val = page->index;

	write_lock(&swapper_space.page_lock);
	__delete_from_swap_cache(page);
	write_unlock(&swapper_space.page_lock);

	swap_free(entry);
	page_cache_release(page);
}

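/*
 * Move a page from its current mapping into the swap cache, re-keyed by
 * the swap entry, as used when shmem pushes a page out to swap (cf.
 * shmem_writepage()).  The page must belong to a mapping; on success its
 * dirty bit is cleared and it is indexed by entry.val in swapper_space.
 */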
int move_to_swap_cache(struct page *page, swp_entry_t entry)
{
	struct address_space *mapping = page->mapping;
	void **pslot;
	int err;

	if (!mapping)
		BUG();

	if (!swap_duplicate(entry)) {
		INC_CACHE_INFO(noent_race);
		return -ENOENT;
	}

	write_lock(&swapper_space.page_lock);
	write_lock(&mapping->page_lock);

	err = radix_tree_reserve(&swapper_space.page_tree, entry.val, &pslot);
	if (!err) {
		/* Remove it from the page cache */
		__remove_inode_page(page);

		/* Add it to the swap cache */
		*pslot = page;
		page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
				 1 << PG_referenced | 1 << PG_arch_1 |
				 1 << PG_checked);
		SetPageLocked(page);
		ClearPageDirty(page);
		___add_to_page_cache(page, &swapper_space, entry.val);
	}

	write_unlock(&mapping->page_lock);
	write_unlock(&swapper_space.page_lock);

	if (!err) {
		INC_CACHE_INFO(add_total);
		return 0;
	}

	swap_free(entry);

	if (err == -EEXIST)
		INC_CACHE_INFO(exist_race);

	return err;
}

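/*
 * Move a page out of the swap cache and back into a regular mapping at
 * @index, as used when swap space is handed back to shmem (cf.
 * shmem_unuse()).  The page ends up dirty and on the mapping's
 * dirty_pages list, so its contents will be written back through the
 * new mapping instead of being lost.
 */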
int move_from_swap_cache(struct page *page, unsigned long index,
		struct address_space *mapping)
{
	void **pslot;
	int err;

	if (!PageLocked(page))
		BUG();

	write_lock(&swapper_space.page_lock);
	write_lock(&mapping->page_lock);

	err = radix_tree_reserve(&mapping->page_tree, index, &pslot);
	if (!err) {
		swp_entry_t entry;

		block_flushpage(page, 0);
		entry.val = page->index;
		__delete_from_swap_cache(page);
		swap_free(entry);

		*pslot = page;
		page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
				 1 << PG_referenced | 1 << PG_arch_1 |
				 1 << PG_checked);
		/*
		 * ___add_to_page_cache puts the page on ->clean_pages,
		 * but it's dirty.  If it's on ->clean_pages, it will basically
		 * never get written out.
		 */
		SetPageDirty(page);
		___add_to_page_cache(page, mapping, index);
		/* fix that up */
		list_del(&page->list);
		list_add(&page->list, &mapping->dirty_pages);
	}

	write_unlock(&mapping->page_lock);
	write_unlock(&swapper_space.page_lock);

	return err;
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if this is the last user of the page.  We cannot do a
 * lock_page() here because we are holding the page_table_lock spinlock.
 */
void free_page_and_swap_cache(struct page *page)
{
	/*
	 * If we are the only user, then try to free up the swap cache.
	 *
	 * It's ok to check for PageSwapCache without the page lock
	 * here because we are going to recheck again inside
	 * remove_exclusive_swap_page() _with_ the lock.
	 * 					- Marcelo
	 */
	if (PageSwapCache(page) && !TestSetPageLocked(page)) {
		remove_exclusive_swap_page(page);
		unlock_page(page);
	}
	page_cache_release(page);
}
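
/*
 * Hypothetical caller sketch: a pte-teardown loop that holds
 * mm->page_table_lock, and so must use this instead of lock_page():
 *
 *	pte_t pte = ptep_get_and_clear(ptep);
 *	if (pte_present(pte))
 *		free_page_and_swap_cache(pte_page(pte));
 */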

/*
 * Look up a swap entry in the swap cache.  A found page is returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock to keep page table operations atomic even though we drop the
 * page lock before returning.
 */
struct page *lookup_swap_cache(swp_entry_t entry)
{
	struct page *found;

	found = find_get_page(&swapper_space, entry.val);
	/*
	 * Unsafe to assert PageSwapCache and mapping on page found:
	 * if SMP nothing prevents swapoff from deleting this page from
	 * the swap cache at this moment.  find_lock_page would prevent
	 * that, but no need to change: we _have_ got the right page.
	 */
	INC_CACHE_INFO(find_total);
	if (found)
		INC_CACHE_INFO(find_success);
	return found;
}
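
/*
 * Hypothetical fault-path sketch (cf. do_swap_page()): check the cache
 * first, and only reserve cache space and touch the disk on a miss:
 *
 *	page = lookup_swap_cache(entry);
 *	if (!page)
 *		page = read_swap_cache_async(entry);
 *	if (!page)
 *		goto out;	(out of memory, or entry already freed)
 */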

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means either that the page allocation failed or
 * that the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry)
{
	struct page *found_page, *new_page = NULL;
	int err;

	do {
		/*
		 * First check the swap cache.  Since this is normally
		 * called after lookup_swap_cache() failed, re-calling
		 * that would confuse statistics: use find_get_page()
		 * directly.
		 */
		found_page = find_get_page(&swapper_space, entry.val);
		if (found_page)
			break;

		/*
		 * Get a new page to read into from swap.
		 */
		if (!new_page) {
			new_page = alloc_page(GFP_HIGHUSER);
			if (!new_page)
				break;		/* Out of memory */
		}

		/*
		 * Associate the page with swap entry in the swap cache.
		 * May fail (-ENOENT) if swap entry has been freed since
		 * our caller observed it.  May fail (-EEXIST) if there
		 * is already a page associated with this entry in the
		 * swap cache: added by a racing read_swap_cache_async,
		 * or by try_to_swap_out (or shmem_writepage) re-using
		 * the just freed swap entry for an existing page.
		 * May fail (-ENOMEM) if radix-tree node allocation failed.
		 */
		err = add_to_swap_cache(new_page, entry);
		if (!err) {
			/*
			 * Initiate read into locked page and return.
			 */
			rw_swap_page(READ, new_page);
			return new_page;
		}
	} while (err != -ENOENT && err != -ENOMEM);

	if (new_page)
		page_cache_release(new_page);
	return found_page;
}
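
/*
 * Hypothetical readahead sketch, assuming the swp_entry() helper from
 * swapops (cf. swapin_readahead()): pages brought in here are released
 * immediately, leaving them in the swap cache for the faulting path to
 * find with a plain lookup:
 *
 *	for (i = 0; i < num; i++) {
 *		page = read_swap_cache_async(swp_entry(type, offset + i));
 *		if (!page)
 *			break;
 *		page_cache_release(page);
 *	}
 */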