Commit 1251704a authored by Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "15 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, docs: update memory.stat description with workingset* entries
  mm: vmscan: scan until it finds eligible pages
  mm, thp: copying user pages must schedule on collapse
  dax: fix PMD data corruption when fault races with write
  dax: fix data corruption when fault races with write
  ext4: return to starting transaction in ext4_dax_huge_fault()
  mm: fix data corruption due to stale mmap reads
  dax: prevent invalidation of mapped DAX entries
  Tigran has moved
  mm, vmalloc: fix vmalloc users tracking properly
  mm/khugepaged: add missed tracepoint for collapse_huge_page_swapin
  gcov: support GCC 7.1
  mm, vmstat: Remove spurious WARN() during zoneinfo print
  time: delete current_fs_time()
  hwpoison, memcg: forcibly uncharge LRU pages
parents 0fcc3ab2 b340959e
......@@ -918,6 +918,18 @@ PAGE_SIZE multiple when read back.
Number of major page faults incurred
workingset_refault
Number of refaults of previously evicted pages
workingset_activate
Number of refaulted pages that were immediately activated
workingset_nodereclaim
Number of times a shadow node has been reclaimed
memory.swap.current
A read-only single value file which exists on non-root
......
......@@ -54,4 +54,4 @@ The first 4 bytes should be 0x1badface.
If you have any patches, questions or suggestions regarding this BFS
implementation please contact the author:
Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
Tigran Aivazian <aivazian.tigran@gmail.com>
......@@ -2483,7 +2483,7 @@ S: Maintained
F: drivers/net/ethernet/ec_bhf.c
BFS FILE SYSTEM
M: "Tigran A. Aivazian" <tigran@aivazian.fsnet.co.uk>
M: "Tigran A. Aivazian" <aivazian.tigran@gmail.com>
S: Maintained
F: Documentation/filesystems/bfs.txt
F: fs/bfs/
......
......@@ -10,7 +10,7 @@
* Author: Peter Oruba <peter.oruba@amd.com>
*
* Based on work by:
* Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* Tigran Aivazian <aivazian.tigran@gmail.com>
*
* early loader:
* Copyright (C) 2013 Advanced Micro Devices, Inc.
......
/*
* CPU Microcode Update Driver for Linux
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* 2006 Shaohua Li <shaohua.li@intel.com>
* 2013-2016 Borislav Petkov <bp@alien8.de>
*
......
/*
* Intel CPU Microcode Update Driver for Linux
*
* Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* Copyright (C) 2000-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* 2006 Shaohua Li <shaohua.li@intel.com>
*
* Intel CPU microcode early update for Linux
......
/*
* fs/bfs/inode.c
* BFS superblock and inode operations.
* Copyright (C) 1999-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
* Copyright (C) 1999-2006 Tigran Aivazian <aivazian.tigran@gmail.com>
* From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds.
*
* Made endianness-clean by Andrew Stribblehill <ads@wompom.org>, 2005.
......@@ -19,7 +19,7 @@
#include <linux/uaccess.h>
#include "bfs.h"
MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>");
MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux");
MODULE_LICENSE("GPL");
......
......@@ -460,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
return ret;
}
/*
 * Try to drop the exceptional DAX entry at @index without blocking; this is
 * the DAX path taken on behalf of invalidate_inode_pages().  The entry is
 * left in place when it is missing, not exceptional, locked, or tagged
 * dirty / towrite.
 *
 * Returns 1 if the entry was removed from the radix tree, 0 otherwise.
 */
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index)
{
	struct radix_tree_root *page_tree = &mapping->page_tree;
	void *entry, **slot;
	int ret = 0;

	spin_lock_irq(&mapping->tree_lock);
	entry = __radix_tree_lookup(page_tree, index, NULL, &slot);
	/* Only an unlocked, clean exceptional entry may be evicted here. */
	if (entry && radix_tree_exceptional_entry(entry) &&
	    !slot_locked(mapping, slot) &&
	    !radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) &&
	    !radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) {
		radix_tree_delete(page_tree, index);
		mapping->nrexceptional--;
		ret = 1;
	}
	spin_unlock_irq(&mapping->tree_lock);

	/* Waiters are woken only after the tree lock has been dropped. */
	if (ret)
		dax_wake_mapping_entry_waiter(mapping, index, entry, true);
	return ret;
}
/*
* Invalidate exceptional DAX entry if it is clean.
*/
......@@ -1044,7 +1015,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
* into page tables. We have to tear down these mappings so that data
* written by write(2) is visible in mmap.
*/
if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) {
if (iomap->flags & IOMAP_F_NEW) {
invalidate_inode_pages2_range(inode->i_mapping,
pos >> PAGE_SHIFT,
(end - 1) >> PAGE_SHIFT);
......@@ -1177,6 +1148,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
flags |= IOMAP_WRITE;
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
vmf_ret = dax_fault_return(PTR_ERR(entry));
goto out;
}
/*
* Note that we don't bother to use iomap_apply here: DAX required
* the file system block size to be equal the page size, which means
......@@ -1185,17 +1162,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
if (error) {
vmf_ret = dax_fault_return(error);
goto out;
goto unlock_entry;
}
if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
vmf_ret = dax_fault_return(-EIO); /* fs corruption? */
goto finish_iomap;
}
entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
if (IS_ERR(entry)) {
vmf_ret = dax_fault_return(PTR_ERR(entry));
goto finish_iomap;
error = -EIO; /* fs corruption? */
goto error_finish_iomap;
}
sector = dax_iomap_sector(&iomap, pos);
......@@ -1217,13 +1188,13 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
}
if (error)
goto error_unlock_entry;
goto error_finish_iomap;
__SetPageUptodate(vmf->cow_page);
vmf_ret = finish_fault(vmf);
if (!vmf_ret)
vmf_ret = VM_FAULT_DONE_COW;
goto unlock_entry;
goto finish_iomap;
}
switch (iomap.type) {
......@@ -1243,7 +1214,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) {
vmf_ret = dax_load_hole(mapping, &entry, vmf);
goto unlock_entry;
goto finish_iomap;
}
/*FALLTHRU*/
default:
......@@ -1252,10 +1223,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
break;
}
error_unlock_entry:
error_finish_iomap:
vmf_ret = dax_fault_return(error) | major;
unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PAGE_SIZE;
......@@ -1270,7 +1239,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
*/
ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
}
out:
unlock_entry:
put_locked_mapping_entry(mapping, vmf->pgoff, entry);
out:
trace_dax_pte_fault_done(inode, vmf, vmf_ret);
return vmf_ret;
}
......@@ -1416,6 +1387,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
goto fallback;
/*
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
* PMD or a HZP entry. If it can't (because a 4k page is already in
* the tree, for instance), it will return -EEXIST and we just fall
* back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
goto fallback;
/*
* Note that we don't use iomap_apply here. We aren't doing I/O, only
* setting up a mapping, so really we're using iomap_begin() as a way
......@@ -1424,21 +1405,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
pos = (loff_t)pgoff << PAGE_SHIFT;
error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
if (error)
goto fallback;
goto unlock_entry;
if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap;
/*
* grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
* PMD or a HZP entry. If it can't (because a 4k page is already in
* the tree, for instance), it will return -EEXIST and we just fall
* back to 4k entries.
*/
entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
if (IS_ERR(entry))
goto finish_iomap;
switch (iomap.type) {
case IOMAP_MAPPED:
result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry);
......@@ -1446,7 +1417,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
case IOMAP_UNWRITTEN:
case IOMAP_HOLE:
if (WARN_ON_ONCE(write))
goto unlock_entry;
break;
result = dax_pmd_load_hole(vmf, &iomap, &entry);
break;
default:
......@@ -1454,8 +1425,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
break;
}
unlock_entry:
put_locked_mapping_entry(mapping, pgoff, entry);
finish_iomap:
if (ops->iomap_end) {
int copied = PMD_SIZE;
......@@ -1471,6 +1440,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
&iomap);
}
unlock_entry:
put_locked_mapping_entry(mapping, pgoff, entry);
fallback:
if (result == VM_FAULT_FALLBACK) {
split_huge_pmd(vma, vmf->pmd, vmf->address);
......
......@@ -257,6 +257,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
enum page_entry_size pe_size)
{
int result;
handle_t *handle = NULL;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct super_block *sb = inode->i_sb;
bool write = vmf->flags & FAULT_FLAG_WRITE;
......@@ -264,12 +265,24 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
if (write) {
sb_start_pagefault(sb);
file_update_time(vmf->vma->vm_file);
}
down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb));
} else {
down_read(&EXT4_I(inode)->i_mmap_sem);
}
if (!IS_ERR(handle))
result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
else
result = VM_FAULT_SIGBUS;
if (write) {
if (!IS_ERR(handle))
ext4_journal_stop(handle);
up_read(&EXT4_I(inode)->i_mmap_sem);
if (write)
sb_end_pagefault(sb);
} else {
up_read(&EXT4_I(inode)->i_mmap_sem);
}
return result;
}
......
......@@ -89,7 +89,6 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
const struct iomap_ops *ops);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
......
......@@ -1431,7 +1431,6 @@ static inline void i_gid_write(struct inode *inode, gid_t gid)
inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid);
}
extern struct timespec current_fs_time(struct super_block *sb);
extern struct timespec current_time(struct inode *inode);
/*
......
......@@ -6,7 +6,6 @@
#include <linux/list.h>
#include <linux/llist.h>
#include <asm/page.h> /* pgprot_t */
#include <asm/pgtable.h> /* PAGE_KERNEL */
#include <linux/rbtree.h>
struct vm_area_struct; /* vma defining user mapping in mm_types.h */
......@@ -83,22 +82,14 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
const void *caller);
#ifndef CONFIG_MMU
extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
#else
extern void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller);
/*
* We really want to have this inlined due to caller tracking. This
* function is used by the highlevel vmalloc apis and so we want to track
* their callers and inlining will achieve that.
*/
static inline void *__vmalloc_node_flags(unsigned long size,
int node, gfp_t flags)
static inline void *__vmalloc_node_flags_caller(unsigned long size, int node,
gfp_t flags, void *caller)
{
return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
node, __builtin_return_address(0));
return __vmalloc_node_flags(size, node, flags);
}
#else
extern void *__vmalloc_node_flags_caller(unsigned long size,
int node, gfp_t flags, void *caller);
#endif
extern void vfree(const void *addr);
......
......@@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_icall_topn);
/*
 * __gcov_exit - exit hook stub that intentionally does nothing.
 *
 * The symbol is defined and exported so that references to it resolve.
 * NOTE(review): presumably referenced by profiling code emitted by newer
 * GCC releases (the commit summary lists "gcov: support GCC 7.1") - confirm.
 */
void __gcov_exit(void)
{
	/* Unused. */
}
EXPORT_SYMBOL(__gcov_exit);
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
......
......@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#if (__GNUC__ >= 7)
#define GCOV_COUNTERS 9
#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
......
......@@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p)
return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
}
/**
* current_fs_time - Return FS time
* @sb: Superblock.
*
* Return the current time truncated to the time granularity supported by
* the fs.
*/
struct timespec current_fs_time(struct super_block *sb)
{
struct timespec now = current_kernel_time();
return timespec_trunc(now, sb->s_time_gran);
}
EXPORT_SYMBOL(current_fs_time);
/*
* Convert jiffies to milliseconds and back.
*
......
......@@ -612,7 +612,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
spinlock_t *ptl)
{
pte_t *_pte;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) {
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, page++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
struct page *src_page;
......@@ -651,9 +652,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
}
address += PAGE_SIZE;
page++;
cond_resched();
}
}
......@@ -907,9 +906,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
return false;
}
/* check if the pmd is still valid */
if (mm_find_pmd(mm, address) != pmd)
if (mm_find_pmd(mm, address) != pmd) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
}
}
if (ret & VM_FAULT_ERROR) {
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
return false;
......
......@@ -5528,7 +5528,7 @@ static void uncharge_list(struct list_head *page_list)
next = page->lru.next;
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
if (!page->mem_cgroup)
continue;
......
......@@ -539,6 +539,13 @@ static int delete_from_lru_cache(struct page *p)
*/
ClearPageActive(p);
ClearPageUnevictable(p);
/*
* Poisoned page might never drop its ref count to 0 so we have
* to uncharge it manually from its memcg.
*/
mem_cgroup_uncharge(p);
/*
* drop the page count elevated by isolate_lru_page()
*/
......
......@@ -67,17 +67,14 @@ static void truncate_exceptional_entry(struct address_space *mapping,
/*
* Invalidate exceptional entry if easily possible. This handles exceptional
* entries for invalidate_inode_pages() so for DAX it evicts only unlocked and
* clean entries.
* entries for invalidate_inode_pages().
*/
static int invalidate_exceptional_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
/* Handled by shmem itself */
if (shmem_mapping(mapping))
/* Handled by shmem itself, or for DAX we do nothing. */
if (shmem_mapping(mapping) || dax_mapping(mapping))
return 1;
if (dax_mapping(mapping))
return dax_invalidate_mapping_entry(mapping, index);
clear_shadow_entry(mapping, index, entry);
return 1;
}
......@@ -689,7 +686,17 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
cond_resched();
index++;
}
/*
* For DAX we invalidate page tables after invalidating radix tree. We
* could invalidate page tables while invalidating each entry however
* that would be expensive. And doing range unmapping before doesn't
* work as we have no cheap way to find whether radix tree entry didn't
* get remapped later.
*/
if (dax_mapping(mapping)) {
unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT,
(loff_t)(end - start + 1) << PAGE_SHIFT, 0);
}
out:
cleancache_invalidate_inode(mapping);
return ret;
......
......@@ -382,7 +382,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
return __vmalloc_node_flags(size, node, flags);
return __vmalloc_node_flags_caller(size, node, flags,
__builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);
......
......@@ -1649,6 +1649,9 @@ void *vmap(struct page **pages, unsigned int count,
}
EXPORT_SYMBOL(vmap);
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
......@@ -1791,7 +1794,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
* with mm people.
*
*/
void *__vmalloc_node(unsigned long size, unsigned long align,
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller)
{
......@@ -1806,6 +1809,20 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
/*
 * Thin wrapper around __vmalloc_node() that fixes the alignment to 1 and
 * the protection to PAGE_KERNEL.  The return address observed here is
 * passed on as the caller argument.
 */
static inline void *__vmalloc_node_flags(unsigned long size,
					 int node, gfp_t flags)
{
	const void *caller = __builtin_return_address(0);

	return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
}
/*
 * Variant of __vmalloc_node_flags() that takes the caller address as an
 * explicit argument instead of using its own return address.  It forwards
 * to __vmalloc_node() with an alignment of 1 and PAGE_KERNEL protection,
 * passing @caller through unchanged.
 */
void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
				  void *caller)
{
	return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
}
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
......
......@@ -1449,7 +1449,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
*
* Appropriate locks must be held before calling this function.
*
* @nr_to_scan: The number of pages to look through on the list.
* @nr_to_scan: The number of eligible pages to look through on the list.
* @lruvec: The LRU vector to pull pages from.
* @dst: The temp list to put pages on to.
* @nr_scanned: The number of pages that were scanned.
......@@ -1469,11 +1469,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
unsigned long skipped = 0;
unsigned long scan, nr_pages;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(pages_skipped);
for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
!list_empty(src); scan++) {
scan = 0;
for (total_scan = 0;
scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
total_scan++) {
struct page *page;
page = lru_to_page(src);
......@@ -1487,6 +1489,13 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
continue;
}
/*
* Do not count skipped pages because that makes the function
* return with no isolated pages if the LRU mostly contains
* ineligible pages. This causes the VM to not reclaim any
* pages, triggering a premature OOM.
*/
scan++;
switch (__isolate_lru_page(page, mode)) {
case 0:
nr_pages = hpage_nr_pages(page);
......@@ -1524,9 +1533,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
skipped += nr_skipped[zid];
}
}
*nr_scanned = scan;
*nr_scanned = total_scan;
trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
scan, skipped, nr_taken, mode, lru);
total_scan, skipped, nr_taken, mode, lru);
update_lru_sizes(lruvec, lru, nr_zone_taken);
return nr_taken;
}
......
......@@ -1359,8 +1359,6 @@ static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone)
return zone == compare;
}
/* The zone must be somewhere! */
WARN_ON_ONCE(1);
return false;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment