Commit 77f9b272 authored by Tobias Klauser, committed by Tobias Klauser

runtime: use MADV_FREE on Linux if available

On Linux, sysUnused currently uses madvise(MADV_DONTNEED) to signal the
kernel that a range of allocated memory contains unneeded data. After a
successful call, the range (but not the data it contained before the
call to madvise) is still available but the first access to that range
will unconditionally incur a page fault (needed to 0-fill the range).

A faster alternative is MADV_FREE, available since Linux 4.5. The
mechanism is very similar, but the page fault will only be incurred if
the kernel, between the call to madvise and the first access, decides to
reuse that memory for something else.

In sysUnused, test whether MADV_FREE is supported and fall back to
MADV_DONTNEED in case it isn't. This requires making the return value of
the madvise syscall available to the caller, so change runtime.madvise
to return it.

Fixes #23687

Change-Id: I962c3429000dd9f4a00846461ad128b71201bb04
Reviewed-on: https://go-review.googlesource.com/135395
Run-TryBot: Tobias Klauser <tobias.klauser@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Ian Lance Taylor <iant@golang.org>
parent a0f5d5f8
......@@ -58,7 +58,10 @@ const (
MAP_PRIVATE = C.MAP_PRIVATE
MAP_FIXED = C.MAP_FIXED
MADV_DONTNEED = C.MADV_DONTNEED
MADV_DONTNEED = C.MADV_DONTNEED
MADV_FREE = C.MADV_FREE
MADV_HUGEPAGE = C.MADV_HUGEPAGE
MADV_NOHUGEPAGE = C.MADV_NOHUGEPAGE
SA_RESTART = C.SA_RESTART
SA_ONSTACK = C.SA_ONSTACK
......
......@@ -47,7 +47,10 @@ const (
MAP_PRIVATE = C.MAP_PRIVATE
MAP_FIXED = C.MAP_FIXED
MADV_DONTNEED = C.MADV_DONTNEED
MADV_DONTNEED = C.MADV_DONTNEED
MADV_FREE = C.MADV_FREE
MADV_HUGEPAGE = C.MADV_HUGEPAGE
MADV_NOHUGEPAGE = C.MADV_NOHUGEPAGE
SA_RESTART = C.SA_RESTART
SA_ONSTACK = C.SA_ONSTACK
......
......@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf
......
......@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf
......
......@@ -16,6 +16,7 @@ const (
_MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf
......
......@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf
......
......@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf
......
......@@ -22,6 +22,7 @@ const (
_MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf
......
......@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf
......
......@@ -18,6 +18,7 @@ const (
_MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf
......
......@@ -19,6 +19,7 @@ const (
_MAP_FIXED = 0x10
_MADV_DONTNEED = 0x4
_MADV_FREE = 0x8
_MADV_HUGEPAGE = 0xe
_MADV_NOHUGEPAGE = 0xf
......
......@@ -5,6 +5,7 @@
package runtime
import (
"runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
)
......@@ -34,10 +35,12 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
return p
}
// adviseUnused is the advice value sysUnused passes to madvise. It starts out
// as _MADV_FREE and is switched to _MADV_DONTNEED once a madvise call made
// with _MADV_FREE reports an error.
var adviseUnused uint32 = _MADV_FREE
func sysUnused(v unsafe.Pointer, n uintptr) {
// By default, Linux's "transparent huge page" support will
// merge pages into a huge page if there's even a single
// present regular page, undoing the effects of the DONTNEED
// present regular page, undoing the effects of madvise(adviseUnused)
// below. On amd64, that means khugepaged can turn a single
// 4KB page to 2MB, bloating the process's RSS by as much as
// 512X. (See issue #8832 and Linux kernel bug
......@@ -102,7 +105,13 @@ func sysUnused(v unsafe.Pointer, n uintptr) {
throw("unaligned sysUnused")
}
madvise(v, n, _MADV_DONTNEED)
advise := atomic.Load(&adviseUnused)
if errno := madvise(v, n, int32(advise)); advise == _MADV_FREE && errno != 0 {
// MADV_FREE was added in Linux 4.5. Fall back to MADV_DONTNEED if it is
// not supported.
atomic.Store(&adviseUnused, _MADV_DONTNEED)
madvise(v, n, _MADV_DONTNEED)
}
}
func sysUsed(v unsafe.Pointer, n uintptr) {
......
......@@ -25,7 +25,8 @@ func write(fd uintptr, p unsafe.Pointer, n int32) int32
//go:noescape
func open(name *byte, mode, perm int32) int32
func madvise(addr unsafe.Pointer, n uintptr, flags int32)
// madvise returns the result of the madvise system call (0 on success).
// On linux, sysUnused checks it to fall back from MADV_FREE to MADV_DONTNEED
// when MADV_FREE is not supported.
// NOTE(review): whether other platforms return a meaningful value depends on
// their assembly stubs — confirm before relying on it there.
func madvise(addr unsafe.Pointer, n uintptr, flags int32) int32
// exitThread terminates the current thread, writing *wait = 0 when
// the stack is safe to reclaim.
......
......@@ -260,9 +260,11 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX
MOVQ $75, AX // madvise
SYSCALL
// ignore failure - maybe pages are locked
JCC 2(PC)
MOVL $-1, AX
MOVL AX, ret+24(FP)
RET
TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
MOVQ new+0(FP), DI
MOVQ old+8(FP), SI
......
......@@ -163,7 +163,9 @@ TEXT runtime·munmap(SB),NOSPLIT,$-4
// madvise issues system call 75 and stores the outcome in ret.
// Failure is not fatal (the pages may be locked); it is reported to the
// caller as -1 instead of crashing.
TEXT runtime·madvise(SB),NOSPLIT,$-4
MOVL $75, AX // madvise
INT $0x80
// JAE is taken when the carry flag is clear and skips the next instruction,
// so AX keeps the kernel's result; with carry set (treated here as syscall
// failure) AX is replaced by -1.
JAE 2(PC)
MOVL $-1, AX
MOVL AX, ret+12(FP)
RET
TEXT runtime·setitimer(SB), NOSPLIT, $-4
......
......@@ -337,9 +337,11 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX
MOVQ $75, AX // madvise
SYSCALL
// ignore failure - maybe pages are locked
JCC 2(PC)
MOVL $-1, AX
MOVL AX, ret+24(FP)
RET
TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
MOVQ new+0(FP), DI
MOVQ old+8(FP), SI
......
......@@ -264,14 +264,15 @@ TEXT runtime·munmap(SB),NOSPLIT,$0
RET
TEXT runtime·madvise(SB),NOSPLIT,$0
MOVW addr+0(FP), R0 // arg 1 addr
MOVW n+4(FP), R1 // arg 2 len
MOVW flags+8(FP), R2 // arg 3 flags
MOVW $SYS_madvise, R7
SWI $0
// ignore failure - maybe pages are locked
MOVW addr+0(FP), R0 // arg 1 addr
MOVW n+4(FP), R1 // arg 2 len
MOVW flags+8(FP), R2 // arg 3 flags
MOVW $SYS_madvise, R7
SWI $0
MOVW.CS $-1, R0
MOVW R0, ret+12(FP)
RET
TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
MOVW new+0(FP), R0
MOVW old+4(FP), R1
......
......@@ -427,7 +427,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL n+4(FP), CX
MOVL flags+8(FP), DX
INVOKE_SYSCALL
// ignore failure - maybe pages are locked
MOVL AX, ret+12(FP)
RET
// int32 futex(int32 *uaddr, int32 op, int32 val,
......
......@@ -519,7 +519,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX
MOVQ $SYS_madvise, AX
SYSCALL
// ignore failure - maybe pages are locked
MOVL AX, ret+24(FP)
RET
// int64 futex(int32 *uaddr, int32 op, int32 val,
......
......@@ -195,7 +195,7 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVW flags+8(FP), R2
MOVW $SYS_madvise, R7
SWI $0
// ignore failure - maybe pages are locked
MOVW R0, ret+12(FP)
RET
TEXT runtime·setitimer(SB),NOSPLIT,$0
......
......@@ -401,7 +401,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
MOVW flags+16(FP), R2
MOVD $SYS_madvise, R8
SVC
// ignore failure - maybe pages are locked
MOVW R0, ret+24(FP)
RET
// int64 futex(int32 *uaddr, int32 op, int32 val,
......
......@@ -291,7 +291,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
MOVW flags+16(FP), R6
MOVV $SYS_madvise, R2
SYSCALL
// ignore failure - maybe pages are locked
MOVW R2, ret+24(FP)
RET
// int64 futex(int32 *uaddr, int32 op, int32 val,
......
......@@ -302,13 +302,13 @@ TEXT runtime·munmap(SB),NOSPLIT,$0-8
UNDEF // crash
RET
TEXT runtime·madvise(SB),NOSPLIT,$0-12
TEXT runtime·madvise(SB),NOSPLIT,$0-16
MOVW addr+0(FP), R4
MOVW n+4(FP), R5
MOVW flags+8(FP), R6
MOVW $SYS_madvise, R2
SYSCALL
// ignore failure - maybe pages are locked
MOVW R2, ret+12(FP)
RET
// int32 futex(int32 *uaddr, int32 op, int32 val, struct timespec *timeout, int32 *uaddr2, int32 val2);
......
......@@ -454,7 +454,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
MOVD n+8(FP), R4
MOVW flags+16(FP), R5
SYSCALL $SYS_madvise
// ignore failure - maybe pages are locked
MOVW R3, ret+24(FP)
RET
// int64 futex(int32 *uaddr, int32 op, int32 val,
......
......@@ -290,7 +290,7 @@ TEXT runtime·madvise(SB),NOSPLIT|NOFRAME,$0
MOVW flags+16(FP), R4
MOVW $SYS_madvise, R1
SYSCALL
// ignore failure - maybe pages are locked
MOVW R2, ret+24(FP)
RET
// int64 futex(int32 *uaddr, int32 op, int32 val,
......
......@@ -135,7 +135,9 @@ TEXT runtime·munmap(SB),NOSPLIT,$-4
// madvise issues system call 75 and stores the outcome in ret.
// Failure is not fatal (the pages may be locked); it is reported to the
// caller as -1 instead of crashing.
TEXT runtime·madvise(SB),NOSPLIT,$-4
MOVL $75, AX // sys_madvise
INT $0x80
// JAE is taken when the carry flag is clear and skips the next instruction,
// so AX keeps the kernel's result; with carry set (treated here as syscall
// failure) AX is replaced by -1.
JAE 2(PC)
MOVL $-1, AX
MOVL AX, ret+12(FP)
RET
TEXT runtime·setitimer(SB),NOSPLIT,$-4
......
......@@ -319,7 +319,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX // arg 3 - behav
MOVQ $75, AX // sys_madvise
SYSCALL
// ignore failure - maybe pages are locked
JCC 2(PC)
MOVL $-1, AX
MOVL AX, ret+24(FP)
RET
TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
......
......@@ -284,11 +284,12 @@ TEXT runtime·munmap(SB),NOSPLIT,$0
RET
TEXT runtime·madvise(SB),NOSPLIT,$0
MOVW addr+0(FP), R0 // arg 1 - addr
MOVW n+4(FP), R1 // arg 2 - len
MOVW flags+8(FP), R2 // arg 3 - behav
SWI $0xa0004b // sys_madvise
// ignore failure - maybe pages are locked
MOVW addr+0(FP), R0 // arg 1 - addr
MOVW n+4(FP), R1 // arg 2 - len
MOVW flags+8(FP), R2 // arg 3 - behav
SWI $0xa0004b // sys_madvise
MOVW.CS $-1, R0
MOVW R0, ret+12(FP)
RET
TEXT runtime·sigaltstack(SB),NOSPLIT|NOFRAME,$0
......
......@@ -136,7 +136,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$-4
MOVL $75, AX // sys_madvise
INT $0x80
JAE 2(PC)
MOVL $0xf1, 0xf1 // crash
MOVL $-1, AX
MOVL AX, ret+12(FP)
RET
TEXT runtime·setitimer(SB),NOSPLIT,$-4
......
......@@ -305,7 +305,9 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVL flags+16(FP), DX // arg 3 - behav
MOVQ $75, AX // sys_madvise
SYSCALL
// ignore failure - maybe pages are locked
JCC 2(PC)
MOVL $-1, AX
MOVL AX, ret+24(FP)
RET
TEXT runtime·sigaltstack(SB),NOSPLIT,$-8
......
......@@ -143,8 +143,8 @@ TEXT runtime·madvise(SB),NOSPLIT,$0
MOVW flags+8(FP), R2 // arg 2 - flags
MOVW $75, R12 // sys_madvise
SWI $0
MOVW.CS $0, R8 // crash on syscall failure
MOVW.CS R8, (R8)
MOVW.CS $-1, R0
MOVW R0, ret+12(FP)
RET
TEXT runtime·setitimer(SB),NOSPLIT,$0
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment