Commit 77f9b272 authored by Tobias Klauser, committed by Tobias Klauser

runtime: use MADV_FREE on Linux if available

On Linux, sysUnused currently uses madvise(MADV_DONTNEED) to signal the
kernel that a range of allocated memory contains unneeded data. After a
successful call, the range (but not the data it contained before the
call to madvise) is still available but the first access to that range
will unconditionally incur a page fault (needed to 0-fill the range).

A faster alternative is MADV_FREE, available since Linux 4.5. The
mechanism is very similar, but the page fault will only be incurred if
the kernel, between the call to madvise and the first access, decides to
reuse that memory for something else.

In sysUnused, test whether MADV_FREE is supported and fall back to
MADV_DONTNEED in case it isn't. This requires making the return value of
the madvise syscall available to the caller, so change runtime.madvise
to return it.

Fixes #23687

Change-Id: I962c3429000dd9f4a00846461ad128b71201bb04
Reviewed-on: https://go-review.googlesource.com/135395
Run-TryBot: Tobias Klauser <tobias.klauser@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
parent a0f5d5f8
...@@ -59,6 +59,9 @@ const ( ...@@ -59,6 +59,9 @@ const (
MAP_FIXED = C.MAP_FIXED MAP_FIXED = C.MAP_FIXED
MADV_DONTNEED = C.MADV_DONTNEED MADV_DONTNEED = C.MADV_DONTNEED
MADV_FREE = C.MADV_FREE
MADV_HUGEPAGE = C.MADV_HUGEPAGE
MADV_NOHUGEPAGE = C.MADV_NOHUGEPAGE
SA_RESTART = C.SA_RESTART SA_RESTART = C.SA_RESTART
SA_ONSTACK = C.SA_ONSTACK SA_ONSTACK = C.SA_ONSTACK
......
...@@ -48,6 +48,9 @@ const ( ...@@ -48,6 +48,9 @@ const (
MAP_FIXED = C.MAP_FIXED MAP_FIXED = C.MAP_FIXED
MADV_DONTNEED = C.MADV_DONTNEED MADV_DONTNEED = C.MADV_DONTNEED
MADV_FREE = C.MADV_FREE
MADV_HUGEPAGE = C.MADV_HUGEPAGE
MADV_NOHUGEPAGE = C.MADV_NOHUGEPAGE
SA_RESTART = C.SA_RESTART SA_RESTART = C.SA_RESTART
SA_ONSTACK = C.SA_ONSTACK SA_ONSTACK = C.SA_ONSTACK
......
...@@ -18,6 +18,7 @@ const ( ...@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10 _MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4 _MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe _MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf _MADV_NOHUGEPAGE = 0xf
......
...@@ -18,6 +18,7 @@ const ( ...@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10 _MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4 _MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe _MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf _MADV_NOHUGEPAGE = 0xf
......
...@@ -16,6 +16,7 @@ const ( ...@@ -16,6 +16,7 @@ const (
_MAP_FIXED = 0x10 _MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4 _MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe _MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf _MADV_NOHUGEPAGE = 0xf
......
...@@ -18,6 +18,7 @@ const ( ...@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10 _MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4 _MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe _MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf _MADV_NOHUGEPAGE = 0xf
......
...@@ -18,6 +18,7 @@ const ( ...@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10 _MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4 _MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe _MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf _MADV_NOHUGEPAGE = 0xf
......
...@@ -22,6 +22,7 @@ const ( ...@@ -22,6 +22,7 @@ const (
_MAP_FIXED = 0x10 _MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4 _MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe _MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf _MADV_NOHUGEPAGE = 0xf
......
...@@ -18,6 +18,7 @@ const ( ...@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10 _MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4 _MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe _MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf _MADV_NOHUGEPAGE = 0xf
......
...@@ -18,6 +18,7 @@ const ( ...@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10 _MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4 _MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe _MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf _MADV_NOHUGEPAGE = 0xf
......
...@@ -19,6 +19,7 @@ const ( ...@@ -19,6 +19,7 @@ const (
_MAP_FIXED = 0x10 _MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4 _MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe _MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf _MADV_NOHUGEPAGE = 0xf
......
...@@ -5,6 +5,7 @@ ...@@ -5,6 +5,7 @@
package runtime package runtime
import ( import (
"runtime/internal/atomic"
"runtime/internal/sys" "runtime/internal/sys"
"unsafe" "unsafe"
) )
...@@ -34,10 +35,12 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer { ...@@ -34,10 +35,12 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
return p return p
} }
var adviseUnused = uint32(_MADV_FREE)
func sysUnused(v unsafe.Pointer, n uintptr) { func sysUnused(v unsafe.Pointer, n uintptr) {
// By default, Linux's "transparent huge page" support will // By default, Linux's "transparent huge page" support will
// merge pages into a huge page if there's even a single // merge pages into a huge page if there's even a single
// present regular page, undoing the effects of the DONTNEED // present regular page, undoing the effects of madvise(adviseUnused)
// below. On amd64, that means khugepaged can turn a single // below. On amd64, that means khugepaged can turn a single
// 4KB page to 2MB, bloating the process's RSS by as much as // 4KB page to 2MB, bloating the process's RSS by as much as
// 512X. (See issue #8832 and Linux kernel bug // 512X. (See issue #8832 and Linux kernel bug
...@@ -102,7 +105,13 @@ func sysUnused(v unsafe.Pointer, n uintptr) { ...@@ -102,7 +105,13 @@ func sysUnused(v unsafe.Pointer, n uintptr) {
throw("unaligned sysUnused") throw("unaligned sysUnused")
} }
advise := atomic.Load(&adviseUnused)
if errno := madvise(v, n, int32(advise)); advise == _MADV_FREE && errno != 0 {
// MADV_FREE was added in Linux 4.5. Fall back to MADV_DONTNEED if it is
// not supported.
atomic.Store(&adviseUnused, _MADV_DONTNEED)
madvise(v, n, _MADV_DONTNEED) madvise(v, n, _MADV_DONTNEED)
}
} }
func sysUsed(v unsafe.Pointer, n uintptr) { func sysUsed(v unsafe.Pointer, n uintptr) {
......
...@@ -25,7 +25,8 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32 ...@@ -25,7 +25,8 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32
//go:noescape //go:noescape
func open(name *byte, mode, perm int32) int32 func open(name *byte, mode, perm int32) int32
func madvise(addr unsafe.Pointer, n uintptr, flags int32) // return value is only set on linux to be used in sysUnused()
func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32
// exitThread terminates the current thread, writing *wait = 0 when // exitThread terminates the current thread, writing *wait = 0 when
// the stack is safe to reclaim. // the stack is safe to reclaim.
......
...@@ -260,7 +260,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -260,7 +260,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX MOVL flags+16(FP), DX
MOVQ $75, AX // madvise MOVQ $75, AX // madvise
SYSCALL SYSCALL
// ignore failure - maybe pages are locked JCC 2(PC)
MOVL $-1, AX
MOVL AX, ret+24(FP)
RET RET
TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
......
...@@ -163,7 +163,9 @@ TEXT runtime·munmap(SB),NOSPLIT,$-4 ...@@ -163,7 +163,9 @@ TEXT runtime·munmap(SB),NOSPLIT,$-4
TEXT runtime·madvise(SB),NOSPLIT,$-4 TEXT runtime·madvise(SB),NOSPLIT,$-4
MOVL $75, AX // madvise MOVL $75, AX // madvise
INT $0x80 INT $0x80
// ignore failure - maybe pages are locked JAE 2(PC)
MOVL $-1, AX
MOVL AX, ret+12(FP)
RET RET
TEXT runtime·setitimer(SB), NOSPLIT, $-4 TEXT runtime·setitimer(SB), NOSPLIT, $-4
......
...@@ -337,7 +337,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -337,7 +337,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX MOVL flags+16(FP), DX
MOVQ $75, AX // madvise MOVQ $75, AX // madvise
SYSCALL SYSCALL
// ignore failure - maybe pages are locked JCC 2(PC)
MOVL $-1, AX
MOVL AX, ret+24(FP)
RET RET
TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
......
...@@ -269,7 +269,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -269,7 +269,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVW flags+8(FP), R2 // arg 3 flags MOVW flags+8(FP), R2 // arg 3 flags
MOVW $SYS_madvise, R7 MOVW $SYS_madvise, R7
SWI $0 SWI $0
// ignore failure - maybe pages are locked MOVW.CS $-1, R0
MOVW R0, ret+12(FP)
RET RET
TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0 TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
......
...@@ -427,7 +427,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -427,7 +427,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL n+4(FP), CX MOVL n+4(FP), CX
MOVL flags+8(FP), DX MOVL flags+8(FP), DX
INVOKE_SYSCALL INVOKE_SYSCALL
// ignore failure - maybe pages are locked MOVL AX, ret+12(FP)
RET RET
// int32 futex(int32 *uaddr, int32 op, int32 val, // int32 futex(int32 *uaddr, int32 op, int32 val,
......
...@@ -519,7 +519,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -519,7 +519,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX MOVL flags+16(FP), DX
MOVQ $SYS_madvise, AX MOVQ $SYS_madvise, AX
SYSCALL SYSCALL
// ignore failure - maybe pages are locked MOVL AX, ret+24(FP)
RET RET
// int64 futex(int32 *uaddr, int32 op, int32 val, // int64 futex(int32 *uaddr, int32 op, int32 val,
......
...@@ -195,7 +195,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -195,7 +195,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVW flags+8(FP), R2 MOVW flags+8(FP), R2
MOVW $SYS_madvise, R7 MOVW $SYS_madvise, R7
SWI $0 SWI $0
// ignore failure - maybe pages are locked MOVW R0, ret+12(FP)
RET RET
TEXT runtime·setitimer(SB),NOSPLIT,$0 TEXT runtime·setitimer(SB),NOSPLIT,$0
......
...@@ -401,7 +401,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 ...@@ -401,7 +401,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
MOVW flags+16(FP), R2 MOVW flags+16(FP), R2
MOVD $SYS_madvise, R8 MOVD $SYS_madvise, R8
SVC SVC
// ignore failure - maybe pages are locked MOVW R0, ret+24(FP)
RET RET
// int64 futex(int32 *uaddr, int32 op, int32 val, // int64 futex(int32 *uaddr, int32 op, int32 val,
......
...@@ -291,7 +291,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 ...@@ -291,7 +291,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
MOVW flags+16(FP), R6 MOVW flags+16(FP), R6
MOVV $SYS_madvise, R2 MOVV $SYS_madvise, R2
SYSCALL SYSCALL
// ignore failure - maybe pages are locked MOVW R2, ret+24(FP)
RET RET
// int64 futex(int32 *uaddr, int32 op, int32 val, // int64 futex(int32 *uaddr, int32 op, int32 val,
......
...@@ -302,13 +302,13 @@ TEXT runtime·munmap(SB),NOSPLIT,$0-8 ...@@ -302,13 +302,13 @@ TEXT runtime·munmap(SB),NOSPLIT,$0-8
UNDEF // crash UNDEF // crash
RET RET
TEXT runtime·madvise(SB),NOSPLIT,$0-12 TEXT runtime·madvise(SB),NOSPLIT,$0-16
MOVW addr+0(FP), R4 MOVW addr+0(FP), R4
MOVW n+4(FP), R5 MOVW n+4(FP), R5
MOVW flags+8(FP), R6 MOVW flags+8(FP), R6
MOVW $SYS_madvise, R2 MOVW $SYS_madvise, R2
SYSCALL SYSCALL
// ignore failure - maybe pages are locked MOVW R2, ret+12(FP)
RET RET
// int32 futex(int32 *uaddr, int32 op, int32 val, struct timespec *timeout, int32 *uaddr2, int32 val2); // int32 futex(int32 *uaddr, int32 op, int32 val, struct timespec *timeout, int32 *uaddr2, int32 val2);
......
...@@ -454,7 +454,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 ...@@ -454,7 +454,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
MOVD n+8(FP), R4 MOVD n+8(FP), R4
MOVW flags+16(FP), R5 MOVW flags+16(FP), R5
SYSCALL $SYS_madvise SYSCALL $SYS_madvise
// ignore failure - maybe pages are locked MOVW R3, ret+24(FP)
RET RET
// int64 futex(int32 *uaddr, int32 op, int32 val, // int64 futex(int32 *uaddr, int32 op, int32 val,
......
...@@ -290,7 +290,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0 ...@@ -290,7 +290,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
MOVW flags+16(FP), R4 MOVW flags+16(FP), R4
MOVW $SYS_madvise, R1 MOVW $SYS_madvise, R1
SYSCALL SYSCALL
// ignore failure - maybe pages are locked MOVW R2, ret+24(FP)
RET RET
// int64 futex(int32 *uaddr, int32 op, int32 val, // int64 futex(int32 *uaddr, int32 op, int32 val,
......
...@@ -135,7 +135,9 @@ TEXT runtime·munmap(SB),NOSPLIT,$-4 ...@@ -135,7 +135,9 @@ TEXT runtime·munmap(SB),NOSPLIT,$-4
TEXT runtime·madvise(SB),NOSPLIT,$-4 TEXT runtime·madvise(SB),NOSPLIT,$-4
MOVL $75, AX // sys_madvise MOVL $75, AX // sys_madvise
INT $0x80 INT $0x80
// ignore failure - maybe pages are locked JAE 2(PC)
MOVL $-1, AX
MOVL AX, ret+12(FP)
RET RET
TEXT runtime·setitimer(SB),NOSPLIT,$-4 TEXT runtime·setitimer(SB),NOSPLIT,$-4
......
...@@ -319,7 +319,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -319,7 +319,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX // arg 3 - behav MOVL flags+16(FP), DX // arg 3 - behav
MOVQ $75, AX // sys_madvise MOVQ $75, AX // sys_madvise
SYSCALL SYSCALL
// ignore failure - maybe pages are locked JCC 2(PC)
MOVL $-1, AX
MOVL AX, ret+24(FP)
RET RET
TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
......
...@@ -288,7 +288,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -288,7 +288,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVW n+4(FP), R1 // arg 2 - len MOVW n+4(FP), R1 // arg 2 - len
MOVW flags+8(FP), R2 // arg 3 - behav MOVW flags+8(FP), R2 // arg 3 - behav
SWI $0xa0004b // sys_madvise SWI $0xa0004b // sys_madvise
// ignore failure - maybe pages are locked MOVW.CS $-1, R0
MOVW R0, ret+12(FP)
RET RET
TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0 TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
......
...@@ -136,7 +136,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$-4 ...@@ -136,7 +136,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$-4
MOVL $75, AX // sys_madvise MOVL $75, AX // sys_madvise
INT $0x80 INT $0x80
JAE 2(PC) JAE 2(PC)
MOVL $0xf1, 0xf1 // crash MOVL $-1, AX
MOVL AX, ret+12(FP)
RET RET
TEXT runtime·setitimer(SB),NOSPLIT,$-4 TEXT runtime·setitimer(SB),NOSPLIT,$-4
......
...@@ -305,7 +305,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -305,7 +305,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX // arg 3 - behav MOVL flags+16(FP), DX // arg 3 - behav
MOVQ $75, AX // sys_madvise MOVQ $75, AX // sys_madvise
SYSCALL SYSCALL
// ignore failure - maybe pages are locked JCC 2(PC)
MOVL $-1, AX
MOVL AX, ret+24(FP)
RET RET
TEXT runtime·sigaltstack(SB),NOSPLIT,$-8 TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
......
...@@ -143,8 +143,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$0 ...@@ -143,8 +143,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVW flags+8(FP), R2 // arg 2 - flags MOVW flags+8(FP), R2 // arg 2 - flags
MOVW $75, R12 // sys_madvise MOVW $75, R12 // sys_madvise
SWI $0 SWI $0
MOVW.CS $0, R8 // crash on syscall failure MOVW.CS $-1, R0
MOVW.CS R8, (R8) MOVW R0, ret+12(FP)
RET RET
TEXT runtime·setitimer(SB),NOSPLIT,$0 TEXT runtime·setitimer(SB),NOSPLIT,$0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment