Commit d9104d1c authored by Cyrill Gorcunov, committed by Linus Torvalds

mm: track vma changes with VM_SOFTDIRTY bit

Pavel reported that if a vma gets unmapped and then mapped (or expanded)
in place, the soft-dirty tracker cannot recognize the situation: it works
at the pte level, and the ptes get zapped on unmap, losing the soft-dirty
bit of course.

To resolve this we need to track such actions at the vma level, which is
where the VM_SOFTDIRTY flag comes in.  When a new vma is created (or an
old one expanded) we set this bit, and it stays set until the application
asks for the soft-dirty state to be cleared.

Thus a user-space application tracking memory changes can now detect that
a vma has been renewed.
Reported-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Cc: Rob Landley <rob@landley.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 3b11f0aa
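
As a user-space illustration of the tracking loop the message describes, here is a
minimal sketch (not part of this commit; the helper names are ours and error paths
are trimmed). It starts a tracking period by writing "4" to /proc/self/clear_refs,
then checks bit 55 of the matching /proc/self/pagemap entry, the soft-dirty bit
documented in Documentation/vm/soft-dirty.txt. It assumes a kernel built with
CONFIG_MEM_SOFT_DIRTY.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#define PM_SOFT_DIRTY	(1ULL << 55)	/* bit 55 of a pagemap entry */

/* Read the pagemap entry covering vaddr for the calling process. */
static uint64_t pagemap_entry(void *vaddr)
{
	uint64_t ent;
	long psize = sysconf(_SC_PAGESIZE);
	off_t off = ((uintptr_t)vaddr / psize) * sizeof(ent);
	int fd = open("/proc/self/pagemap", O_RDONLY);

	if (fd < 0 || pread(fd, &ent, sizeof(ent), off) != sizeof(ent)) {
		perror("pagemap");
		exit(1);
	}
	close(fd);
	return ent;
}

/* Start a tracking period: writing "4" clears all soft-dirty bits. */
static void clear_soft_dirty(void)
{
	int fd = open("/proc/self/clear_refs", O_WRONLY);

	if (fd < 0 || write(fd, "4", 1) != 1) {
		perror("clear_refs");
		exit(1);
	}
	close(fd);
}

int main(void)
{
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	p[0] = 1;			/* fault the page in */
	clear_soft_dirty();
	printf("after clear: %s\n",
	       (pagemap_entry(p) & PM_SOFT_DIRTY) ? "dirty" : "clean");
	p[0] = 2;			/* write fault sets soft-dirty again */
	printf("after write: %s\n",
	       (pagemap_entry(p) & PM_SOFT_DIRTY) ? "dirty" : "clean");
	return 0;
}

With this patch applied, clearing soft-dirty also drops VM_SOFTDIRTY from each vma,
so the first line prints "clean" and the second "dirty".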
--- a/Documentation/vm/soft-dirty.txt
+++ b/Documentation/vm/soft-dirty.txt
@@ -28,6 +28,13 @@ This is so, since the pages are still mapped to physical memory, and thus all
 the kernel does is finds this fact out and puts both writable and soft-dirty
 bits on the PTE.
 
+While in most cases tracking memory changes by #PF-s is more than enough
+there is still a scenario when we can lose soft dirty bits -- a task
+unmaps a previously mapped memory region and then maps a new one at exactly
+the same place. When unmap is called, the kernel internally clears PTE values
+including soft dirty bits. To notify user space application about such
+memory region renewal the kernel always marks new memory regions (and
+expanded regions) as soft dirty.
+
 This feature is actively used by the checkpoint-restore project. You
 can find more details about it on http://criu.org
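
The scenario this new documentation paragraph describes can be reproduced from user
space. The sketch below is illustrative only (error checks omitted, kernel with
CONFIG_MEM_SOFT_DIRTY assumed): it unmaps a tracked region and maps a fresh one at
the same address with MAP_FIXED; the renewed range then reports soft-dirty even
before any of its pages are faulted in, purely from the vma-level flag.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define PM_SOFT_DIRTY	(1ULL << 55)

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psize, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	uint64_t ent;
	int fd;

	p[0] = 1;			/* populate the page */

	/* Start a tracking period: clear every soft-dirty bit. */
	fd = open("/proc/self/clear_refs", O_WRONLY);
	write(fd, "4", 1);
	close(fd);

	/* Unmap, then map a new anonymous region at the same address. */
	munmap(p, psize);
	p = mmap(p, psize, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	/*
	 * The new vma carries VM_SOFTDIRTY, so pagemap reports the range
	 * as soft-dirty even though its PTEs were re-created from scratch
	 * and the page has not even been faulted in yet.
	 */
	fd = open("/proc/self/pagemap", O_RDONLY);
	pread(fd, &ent, sizeof(ent), (uintptr_t)p / psize * sizeof(ent));
	close(fd);
	printf("renewed mapping soft-dirty: %s\n",
	       (ent & PM_SOFT_DIRTY) ? "yes" : "no");
	return 0;
}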
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -266,7 +266,7 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
 	vma->vm_end = STACK_TOP_MAX;
 	vma->vm_start = vma->vm_end - PAGE_SIZE;
-	vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
+	vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -740,6 +740,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		ptent = pte_file_clear_soft_dirty(ptent);
 	}
 
+	if (vma->vm_flags & VM_SOFTDIRTY)
+		vma->vm_flags &= ~VM_SOFTDIRTY;
+
 	set_pte_at(vma->vm_mm, addr, pte, ptent);
 #endif
 }
@@ -949,13 +952,15 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 		if (is_migration_entry(entry))
 			page = migration_entry_to_page(entry);
 	} else {
-		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		if (vma->vm_flags & VM_SOFTDIRTY)
+			flags2 |= __PM_SOFT_DIRTY;
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
 		return;
 	}
 
 	if (page && !PageAnon(page))
 		flags |= PM_FILE;
-	if (pte_soft_dirty(pte))
+	if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte))
 		flags2 |= __PM_SOFT_DIRTY;
 
 	*pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags);
@@ -974,7 +979,7 @@ static void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
 		*pme = make_pme(PM_PFRAME(pmd_pfn(pmd) + offset)
 				| PM_STATUS2(pm->v2, pmd_flags2) | PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, pmd_flags2));
 }
 #else
 static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
@@ -997,7 +1002,11 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (vma && pmd_trans_huge_lock(pmd, vma) == 1) {
 		int pmd_flags2;
 
-		pmd_flags2 = (pmd_soft_dirty(*pmd) ? __PM_SOFT_DIRTY : 0);
+		if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
+			pmd_flags2 = __PM_SOFT_DIRTY;
+		else
+			pmd_flags2 = 0;
+
 		for (; addr != end; addr += PAGE_SIZE) {
 			unsigned long offset;
@@ -1015,12 +1024,17 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	if (pmd_trans_unstable(pmd))
 		return 0;
 	for (; addr != end; addr += PAGE_SIZE) {
+		int flags2;
 
 		/* check to see if we've left 'vma' behind
 		 * and need a new, higher one */
 		if (vma && (addr >= vma->vm_end)) {
 			vma = find_vma(walk->mm, addr);
-			pme = make_pme(PM_NOT_PRESENT(pm->v2));
+			if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+				flags2 = __PM_SOFT_DIRTY;
+			else
+				flags2 = 0;
+			pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2));
 		}
 
 		/* check that 'vma' actually covers this address,
@@ -1044,13 +1058,15 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 
 #ifdef CONFIG_HUGETLB_PAGE
 static void huge_pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm,
-					pte_t pte, int offset)
+					pte_t pte, int offset, int flags2)
 {
 	if (pte_present(pte))
-		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset)
-				| PM_STATUS2(pm->v2, 0) | PM_PRESENT);
+		*pme = make_pme(PM_PFRAME(pte_pfn(pte) + offset) |
+				PM_STATUS2(pm->v2, flags2) |
+				PM_PRESENT);
 	else
-		*pme = make_pme(PM_NOT_PRESENT(pm->v2));
+		*pme = make_pme(PM_NOT_PRESENT(pm->v2) |
+				PM_STATUS2(pm->v2, flags2));
 }
 
 /* This function walks within one hugetlb entry in the single call */
@@ -1059,12 +1075,22 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
 				 struct mm_walk *walk)
 {
 	struct pagemapread *pm = walk->private;
+	struct vm_area_struct *vma;
 	int err = 0;
+	int flags2;
 	pagemap_entry_t pme;
 
+	vma = find_vma(walk->mm, addr);
+	WARN_ON_ONCE(!vma);
+
+	if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+		flags2 = __PM_SOFT_DIRTY;
+	else
+		flags2 = 0;
+
 	for (; addr != end; addr += PAGE_SIZE) {
 		int offset = (addr & ~hmask) >> PAGE_SHIFT;
-		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset);
+		huge_pte_to_pagemap_entry(&pme, pm, *pte, offset, flags2);
 		err = add_to_pagemap(addr, &pme, pm);
 		if (err)
 			return err;
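
For reference when reading the PM_STATUS2() changes above, here is a small
user-space decoder for the 64-bit pagemap entries the walker emits. The bit
positions follow the pagemap documentation for this kernel generation; the macro
and function names below are ours, restated for illustration rather than taken
from this diff.

#include <stdint.h>
#include <stdio.h>

#define PM_PFRAME_MASK	((1ULL << 55) - 1)	/* bits 0-54: page frame number */
#define PM_SOFT_DIRTY	(1ULL << 55)		/* pte/pmd soft-dirty or VM_SOFTDIRTY */
#define PM_FILE		(1ULL << 61)		/* file-mapped or shared-anon page */
#define PM_SWAP		(1ULL << 62)		/* page is swapped out */
#define PM_PRESENT	(1ULL << 63)		/* page is present in RAM */

/*
 * Print the flags of one pagemap entry.  Note that after this patch
 * soft-dirty can be reported even when the page is not present,
 * coming purely from the vma's VM_SOFTDIRTY flag.
 */
static void decode_pagemap_entry(uint64_t ent)
{
	printf("present=%d swap=%d file=%d soft-dirty=%d",
	       !!(ent & PM_PRESENT), !!(ent & PM_SWAP),
	       !!(ent & PM_FILE), !!(ent & PM_SOFT_DIRTY));
	if (ent & PM_PRESENT)
		printf(" pfn=%llu", (unsigned long long)(ent & PM_PFRAME_MASK));
	printf("\n");
}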
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -115,6 +115,12 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+# define VM_SOFTDIRTY	0x08000000	/* Not soft dirty clean area */
+#else
+# define VM_SOFTDIRTY	0
+#endif
+
 #define VM_MIXEDMAP	0x10000000	/* Can contain "struct page" and pure PFN pages */
 #define VM_HUGEPAGE	0x20000000	/* MADV_HUGEPAGE marked this vma */
 #define VM_NOHUGEPAGE	0x40000000	/* MADV_NOHUGEPAGE marked this vma */
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1609,6 +1609,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	if (file)
 		uprobe_mmap(vma);
 
+	/*
+	 * New (or expanded) vma always get soft dirty status.
+	 * Otherwise user-space soft-dirty page tracker won't
+	 * be able to distinguish situation when vma area unmapped,
+	 * then new mapped in-place (which must be aimed as
+	 * a completely new data area).
+	 */
+	vma->vm_flags |= VM_SOFTDIRTY;
+
 	return addr;
 
 unmap_and_free_vma:
@@ -2652,6 +2661,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 	mm->total_vm += len >> PAGE_SHIFT;
 	if (flags & VM_LOCKED)
 		mm->locked_vm += (len >> PAGE_SHIFT);
+	vma->vm_flags |= VM_SOFTDIRTY;
 	return addr;
 }
@@ -2916,7 +2926,7 @@ int install_special_mapping(struct mm_struct *mm,
 	vma->vm_start = addr;
 	vma->vm_end = addr + len;
 
-	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND;
+	vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
 	vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
 	vma->vm_ops = &special_mapping_vmops;