Commit 49e03008 authored by Dmitriy Vyukov's avatar Dmitriy Vyukov

runtime: integrated network poller for linux

vs tip:
BenchmarkTCP4OneShot                    172994        40485  -76.60%
BenchmarkTCP4OneShot-2                   96581        30028  -68.91%
BenchmarkTCP4OneShot-4                   52615        18454  -64.93%
BenchmarkTCP4OneShot-8                   26351        12289  -53.36%
BenchmarkTCP4OneShot-16                  12258        16093  +31.29%
BenchmarkTCP4OneShot-32                  13200        17045  +29.13%

BenchmarkTCP4OneShotTimeout             124814        42932  -65.60%
BenchmarkTCP4OneShotTimeout-2            99090        29040  -70.69%
BenchmarkTCP4OneShotTimeout-4            51860        18455  -64.41%
BenchmarkTCP4OneShotTimeout-8            26100        12073  -53.74%
BenchmarkTCP4OneShotTimeout-16           12198        16654  +36.53%
BenchmarkTCP4OneShotTimeout-32           13438        17143  +27.57%

BenchmarkTCP4Persistent                 115647         7782  -93.27%
BenchmarkTCP4Persistent-2                58024         4808  -91.71%
BenchmarkTCP4Persistent-4                24715         3674  -85.13%
BenchmarkTCP4Persistent-8                16431         2407  -85.35%
BenchmarkTCP4Persistent-16                2336         1875  -19.73%
BenchmarkTCP4Persistent-32                1689         1637   -3.08%

BenchmarkTCP4PersistentTimeout           79754         7859  -90.15%
BenchmarkTCP4PersistentTimeout-2         57708         5952  -89.69%
BenchmarkTCP4PersistentTimeout-4         26907         3823  -85.79%
BenchmarkTCP4PersistentTimeout-8         15036         2567  -82.93%
BenchmarkTCP4PersistentTimeout-16         2507         1903  -24.09%
BenchmarkTCP4PersistentTimeout-32         1717         1627   -5.24%

vs old scheduler:
benchmark                           old ns/op    new ns/op    delta
BenchmarkTCPOneShot                    192244        40485  -78.94%
BenchmarkTCPOneShot-2                   63835        30028  -52.96%
BenchmarkTCPOneShot-4                   35443        18454  -47.93%
BenchmarkTCPOneShot-8                   22140        12289  -44.49%
BenchmarkTCPOneShot-16                  16930        16093   -4.94%
BenchmarkTCPOneShot-32                  16719        17045   +1.95%

BenchmarkTCPOneShotTimeout             190495        42932  -77.46%
BenchmarkTCPOneShotTimeout-2            64828        29040  -55.20%
BenchmarkTCPOneShotTimeout-4            34591        18455  -46.65%
BenchmarkTCPOneShotTimeout-8            21989        12073  -45.10%
BenchmarkTCPOneShotTimeout-16           16848        16654   -1.15%
BenchmarkTCPOneShotTimeout-32           16796        17143   +2.07%

BenchmarkTCPPersistent                  81670         7782  -90.47%
BenchmarkTCPPersistent-2                26598         4808  -81.92%
BenchmarkTCPPersistent-4                15633         3674  -76.50%
BenchmarkTCPPersistent-8                18093         2407  -86.70%
BenchmarkTCPPersistent-16               17472         1875  -89.27%
BenchmarkTCPPersistent-32                7679         1637  -78.68%

BenchmarkTCPPersistentTimeout           83186         7859  -90.55%
BenchmarkTCPPersistentTimeout-2         26883         5952  -77.86%
BenchmarkTCPPersistentTimeout-4         15776         3823  -75.77%
BenchmarkTCPPersistentTimeout-8         18180         2567  -85.88%
BenchmarkTCPPersistentTimeout-16        17454         1903  -89.10%
BenchmarkTCPPersistentTimeout-32         7798         1627  -79.14%

R=golang-dev, iant, bradfitz, dave, rsc
CC=golang-dev
https://golang.org/cl/7579044
parent 4dd1b899
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Waiting for FDs via epoll(7).
package net
import (
"os"
"syscall"
)
const (
readFlags = syscall.EPOLLIN | syscall.EPOLLRDHUP
writeFlags = syscall.EPOLLOUT
)
type pollster struct {
epfd int
// Events we're already waiting for
// Must hold pollServer lock
events map[int]uint32
// An event buffer for EpollWait.
// Used without a lock, may only be used by WaitFD.
waitEventBuf [10]syscall.EpollEvent
waitEvents []syscall.EpollEvent
// An event buffer for EpollCtl, to avoid a malloc.
// Must hold pollServer lock.
ctlEvent syscall.EpollEvent
}
func newpollster() (p *pollster, err error) {
p = new(pollster)
if p.epfd, err = syscall.EpollCreate1(syscall.EPOLL_CLOEXEC); err != nil {
if err != syscall.ENOSYS {
return nil, os.NewSyscallError("epoll_create1", err)
}
// The arg to epoll_create is a hint to the kernel
// about the number of FDs we will care about.
// We don't know, and since 2.6.8 the kernel ignores it anyhow.
if p.epfd, err = syscall.EpollCreate(16); err != nil {
return nil, os.NewSyscallError("epoll_create", err)
}
syscall.CloseOnExec(p.epfd)
}
p.events = make(map[int]uint32)
return p, nil
}
// First return value is whether the pollServer should be woken up.
// This version always returns false.
func (p *pollster) AddFD(fd int, mode int, repeat bool) (bool, error) {
// pollServer is locked.
var already bool
p.ctlEvent.Fd = int32(fd)
p.ctlEvent.Events, already = p.events[fd]
if !repeat {
p.ctlEvent.Events |= syscall.EPOLLONESHOT
}
if mode == 'r' {
p.ctlEvent.Events |= readFlags
} else {
p.ctlEvent.Events |= writeFlags
}
var op int
if already {
op = syscall.EPOLL_CTL_MOD
} else {
op = syscall.EPOLL_CTL_ADD
}
if err := syscall.EpollCtl(p.epfd, op, fd, &p.ctlEvent); err != nil {
return false, os.NewSyscallError("epoll_ctl", err)
}
p.events[fd] = p.ctlEvent.Events
return false, nil
}
func (p *pollster) StopWaiting(fd int, bits uint) {
// pollServer is locked.
events, already := p.events[fd]
if !already {
// The fd returned by the kernel may have been
// cancelled already; return silently.
return
}
// If syscall.EPOLLONESHOT is not set, the wait
// is a repeating wait, so don't change it.
if events&syscall.EPOLLONESHOT == 0 {
return
}
// Disable the given bits.
// If we're still waiting for other events, modify the fd
// event in the kernel. Otherwise, delete it.
events &= ^uint32(bits)
if int32(events)&^syscall.EPOLLONESHOT != 0 {
p.ctlEvent.Fd = int32(fd)
p.ctlEvent.Events = events
if err := syscall.EpollCtl(p.epfd, syscall.EPOLL_CTL_MOD, fd, &p.ctlEvent); err != nil {
print("Epoll modify fd=", fd, ": ", err.Error(), "\n")
}
p.events[fd] = events
} else {
if err := syscall.EpollCtl(p.epfd, syscall.EPOLL_CTL_DEL, fd, nil); err != nil {
print("Epoll delete fd=", fd, ": ", err.Error(), "\n")
}
delete(p.events, fd)
}
}
// Return value is whether the pollServer should be woken up.
// This version always returns false.
func (p *pollster) DelFD(fd int, mode int) bool {
// pollServer is locked.
if mode == 'r' {
p.StopWaiting(fd, readFlags)
} else {
p.StopWaiting(fd, writeFlags)
}
// Discard any queued up events.
i := 0
for i < len(p.waitEvents) {
if fd == int(p.waitEvents[i].Fd) {
copy(p.waitEvents[i:], p.waitEvents[i+1:])
p.waitEvents = p.waitEvents[:len(p.waitEvents)-1]
} else {
i++
}
}
return false
}
func (p *pollster) WaitFD(s *pollServer, nsec int64) (fd int, mode int, err error) {
for len(p.waitEvents) == 0 {
var msec int = -1
if nsec > 0 {
msec = int((nsec + 1e6 - 1) / 1e6)
}
s.Unlock()
n, err := syscall.EpollWait(p.epfd, p.waitEventBuf[0:], msec)
s.Lock()
if err != nil {
if err == syscall.EAGAIN || err == syscall.EINTR {
continue
}
return -1, 0, os.NewSyscallError("epoll_wait", err)
}
if n == 0 {
return -1, 0, nil
}
p.waitEvents = p.waitEventBuf[0:n]
}
ev := &p.waitEvents[0]
p.waitEvents = p.waitEvents[1:]
fd = int(ev.Fd)
if ev.Events&writeFlags != 0 {
p.StopWaiting(fd, writeFlags)
return fd, 'w', nil
}
if ev.Events&readFlags != 0 {
p.StopWaiting(fd, readFlags)
return fd, 'r', nil
}
// Other events are error conditions - wake whoever is waiting.
events, _ := p.events[fd]
if events&writeFlags != 0 {
p.StopWaiting(fd, writeFlags)
return fd, 'w', nil
}
p.StopWaiting(fd, readFlags)
return fd, 'r', nil
}
func (p *pollster) Close() error {
return os.NewSyscallError("close", syscall.Close(p.epfd))
}
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build darwin
// +build darwin linux,386 linux,amd64
package net
......
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build freebsd linux netbsd openbsd
// +build freebsd linux,arm netbsd openbsd
package net
......
......@@ -7,7 +7,7 @@
/*
* Input to cgo -cdefs
GOARCH=386 cgo -cdefs defs2.go >386/defs.h
GOARCH=386 go tool cgo -cdefs defs2_linux.go >defs_linux_386.h
The asm header tricks we have to use for Linux on amd64
(see defs.c and defs1.c) don't work here, so this is yet another
......@@ -17,15 +17,19 @@ file. Sigh.
package runtime
/*
#cgo CFLAGS: -I/home/rsc/pub/linux-2.6/arch/x86/include -I/home/rsc/pub/linux-2.6/include -D_LOOSE_KERNEL_NAMES -D__ARCH_SI_UID_T=__kernel_uid32_t
#cgo CFLAGS: -I/tmp/linux/arch/x86/include -I/tmp/linux/include -D_LOOSE_KERNEL_NAMES -D__ARCH_SI_UID_T=__kernel_uid32_t
#define size_t __kernel_size_t
#define pid_t int
#include <asm/signal.h>
#include <asm/mman.h>
#include <asm/sigcontext.h>
#include <asm/ucontext.h>
#include <asm/siginfo.h>
#include <asm-generic/errno.h>
#include <asm-generic/fcntl.h>
#include <asm-generic/poll.h>
#include <linux/eventpoll.h>
// This is the sigaction structure from the Linux 2.1.68 kernel which
// is used with the rt_sigaction system call. For 386 this is not
......@@ -35,12 +39,16 @@ struct kernel_sigaction {
__sighandler_t k_sa_handler;
unsigned long sa_flags;
void (*sa_restorer) (void);
sigset_t sa_mask;
unsigned long long sa_mask;
};
*/
import "C"
const (
EINTR = C.EINTR
EAGAIN = C.EAGAIN
ENOMEM = C.ENOMEM
PROT_NONE = C.PROT_NONE
PROT_READ = C.PROT_READ
PROT_WRITE = C.PROT_WRITE
......@@ -110,6 +118,17 @@ const (
O_RDONLY = C.O_RDONLY
O_CLOEXEC = C.O_CLOEXEC
EPOLLIN = C.POLLIN
EPOLLOUT = C.POLLOUT
EPOLLERR = C.POLLERR
EPOLLHUP = C.POLLHUP
EPOLLRDHUP = C.POLLRDHUP
EPOLLET = C.EPOLLET
EPOLL_CLOEXEC = C.EPOLL_CLOEXEC
EPOLL_CTL_ADD = C.EPOLL_CTL_ADD
EPOLL_CTL_DEL = C.EPOLL_CTL_DEL
EPOLL_CTL_MOD = C.EPOLL_CTL_MOD
)
type Fpreg C.struct__fpreg
......@@ -124,3 +143,4 @@ type Sigaltstack C.struct_sigaltstack
type Sigcontext C.struct_sigcontext
type Ucontext C.struct_ucontext
type Itimerval C.struct_itimerval
type EpollEvent C.struct_epoll_event
......@@ -7,7 +7,7 @@
/*
Input to cgo -cdefs
GOARCH=amd64 cgo -cdefs defs.go defs1.go >amd64/defs.h
GOARCH=amd64 go tool cgo -cdefs defs_linux.go defs1_linux.go >defs_linux_amd64.h
*/
package runtime
......@@ -25,10 +25,17 @@ package runtime
#include <asm/signal.h>
#include <asm/siginfo.h>
#include <asm/mman.h>
#include <asm-generic/errno.h>
#include <asm-generic/poll.h>
#include <linux/eventpoll.h>
*/
import "C"
const (
EINTR = C.EINTR
EAGAIN = C.EAGAIN
ENOMEM = C.ENOMEM
PROT_NONE = C.PROT_NONE
PROT_READ = C.PROT_READ
PROT_WRITE = C.PROT_WRITE
......@@ -95,6 +102,17 @@ const (
ITIMER_REAL = C.ITIMER_REAL
ITIMER_VIRTUAL = C.ITIMER_VIRTUAL
ITIMER_PROF = C.ITIMER_PROF
EPOLLIN = C.POLLIN
EPOLLOUT = C.POLLOUT
EPOLLERR = C.POLLERR
EPOLLHUP = C.POLLHUP
EPOLLRDHUP = C.POLLRDHUP
EPOLLET = C.EPOLLET
EPOLL_CLOEXEC = C.EPOLL_CLOEXEC
EPOLL_CTL_ADD = C.EPOLL_CTL_ADD
EPOLL_CTL_DEL = C.EPOLL_CTL_DEL
EPOLL_CTL_MOD = C.EPOLL_CTL_MOD
)
type Timespec C.struct_timespec
......@@ -102,3 +120,4 @@ type Timeval C.struct_timeval
type Sigaction C.struct_sigaction
type Siginfo C.siginfo_t
type Itimerval C.struct_itimerval
type EpollEvent C.struct_epoll_event
// Created by cgo -cdefs - DO NOT EDIT
// cgo -cdefs defs2.go
// cgo -cdefs defs2_linux.go
enum {
EINTR = 0x4,
EAGAIN = 0xb,
ENOMEM = 0xc,
PROT_NONE = 0x0,
PROT_READ = 0x1,
PROT_WRITE = 0x2,
......@@ -72,6 +76,17 @@ enum {
O_RDONLY = 0x0,
O_CLOEXEC = 0x80000,
EPOLLIN = 0x1,
EPOLLOUT = 0x4,
EPOLLERR = 0x8,
EPOLLHUP = 0x10,
EPOLLRDHUP = 0x2000,
EPOLLET = -0x80000000,
EPOLL_CLOEXEC = 0x80000,
EPOLL_CTL_ADD = 0x1,
EPOLL_CTL_DEL = 0x2,
EPOLL_CTL_MOD = 0x3,
};
typedef struct Fpreg Fpreg;
......@@ -86,6 +101,7 @@ typedef struct Sigaltstack Sigaltstack;
typedef struct Sigcontext Sigcontext;
typedef struct Ucontext Ucontext;
typedef struct Itimerval Itimerval;
typedef struct EpollEvent EpollEvent;
#pragma pack on
......@@ -186,6 +202,10 @@ struct Itimerval {
Timeval it_interval;
Timeval it_value;
};
struct EpollEvent {
uint32 events;
uint64 data;
};
#pragma pack off
// Created by cgo -cdefs - DO NOT EDIT
// cgo -cdefs defs.go defs1.go
// cgo -cdefs defs_linux.go defs1_linux.go
enum {
EINTR = 0x4,
EAGAIN = 0xb,
ENOMEM = 0xc,
PROT_NONE = 0x0,
PROT_READ = 0x1,
PROT_WRITE = 0x2,
......@@ -69,6 +73,17 @@ enum {
ITIMER_REAL = 0x0,
ITIMER_VIRTUAL = 0x1,
ITIMER_PROF = 0x2,
EPOLLIN = 0x1,
EPOLLOUT = 0x4,
EPOLLERR = 0x8,
EPOLLHUP = 0x10,
EPOLLRDHUP = 0x2000,
EPOLLET = -0x80000000,
EPOLL_CLOEXEC = 0x80000,
EPOLL_CTL_ADD = 0x1,
EPOLL_CTL_DEL = 0x2,
EPOLL_CTL_MOD = 0x3,
};
typedef struct Timespec Timespec;
......@@ -76,6 +91,7 @@ typedef struct Timeval Timeval;
typedef struct Sigaction Sigaction;
typedef struct Siginfo Siginfo;
typedef struct Itimerval Itimerval;
typedef struct EpollEvent EpollEvent;
#pragma pack on
......@@ -104,11 +120,15 @@ struct Itimerval {
Timeval it_interval;
Timeval it_value;
};
struct EpollEvent {
uint32 events;
uint64 data;
};
#pragma pack off
// Created by cgo -cdefs - DO NOT EDIT
// cgo -cdefs defs.go defs1.go
// cgo -cdefs defs_linux.go defs1_linux.go
enum {
......
......@@ -10,8 +10,6 @@
enum
{
EAGAIN = 11,
ENOMEM = 12,
_PAGE_SIZE = 4096,
};
......
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build darwin
// +build darwin linux,386 linux,amd64
package net
......
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build linux,386 linux,amd64
#include "runtime.h"
#include "defs_GOOS_GOARCH.h"
int32 runtime·epollcreate(int32 size);
int32 runtime·epollcreate1(int32 flags);
int32 runtime·epollctl(int32 epfd, int32 op, int32 fd, EpollEvent *ev);
int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
void runtime·closeonexec(int32 fd);
static int32 epfd = -1; // epoll descriptor
void
runtime·netpollinit(void)
{
epfd = runtime·epollcreate1(EPOLL_CLOEXEC);
if(epfd >= 0)
return;
epfd = runtime·epollcreate(1024);
if(epfd >= 0) {
runtime·closeonexec(epfd);
return;
}
runtime·printf("netpollinit: failed to create descriptor (%d)\n", -epfd);
runtime·throw("netpollinit: failed to create descriptor");
}
int32
runtime·netpollopen(int32 fd, PollDesc *pd)
{
EpollEvent ev;
ev.events = EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLET;
ev.data = (uint64)pd;
return runtime·epollctl(epfd, EPOLL_CTL_ADD, fd, &ev);
}
// polls for ready network connections
// returns list of goroutines that become runnable
G*
runtime·netpoll(bool block)
{
EpollEvent events[128], *ev;
int32 n, i, waitms, mode;
G *gp;
if(epfd == -1)
return nil;
waitms = -1;
if(!block)
waitms = 0;
retry:
n = runtime·epollwait(epfd, events, nelem(events), waitms);
if(n < 0) {
if(n != -EINTR)
runtime·printf("epollwait failed with %d\n", -n);
goto retry;
}
gp = nil;
for(i = 0; i < n; i++) {
ev = &events[i];
if(ev->events == 0)
continue;
mode = 0;
if(ev->events & (EPOLLIN|EPOLLRDHUP|EPOLLHUP|EPOLLERR))
mode += 'r';
if(ev->events & (EPOLLOUT|EPOLLHUP|EPOLLERR))
mode += 'w';
if(mode)
runtime·netpollready(&gp, (void*)ev->data, mode);
}
if(block && gp == nil)
goto retry;
return gp;
}
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build freebsd linux netbsd openbsd plan9 windows
// +build freebsd linux,arm netbsd openbsd plan9 windows
#include "runtime.h"
......
......@@ -430,3 +430,46 @@ TEXT runtime·sched_getaffinity(SB),7,$0
MOVL 12(SP), DX
CALL *runtime·_vdso(SB)
RET
// int32 runtime·epollcreate(int32 size);
TEXT runtime·epollcreate(SB),7,$0
MOVL $254, AX
MOVL 4(SP), BX
CALL *runtime·_vdso(SB)
RET
// int32 runtime·epollcreate1(int32 flags);
TEXT runtime·epollcreate1(SB),7,$0
MOVL $329, AX
MOVL 4(SP), BX
CALL *runtime·_vdso(SB)
RET
// int32 runtime·epollctl(int32 epfd, int32 op, int32 fd, EpollEvent *ev);
TEXT runtime·epollctl(SB),7,$0
MOVL $255, AX
MOVL 4(SP), BX
MOVL 8(SP), CX
MOVL 12(SP), DX
MOVL 16(SP), SI
CALL *runtime·_vdso(SB)
RET
// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
TEXT runtime·epollwait(SB),7,$0
MOVL $256, AX
MOVL 4(SP), BX
MOVL 8(SP), CX
MOVL 12(SP), DX
MOVL 16(SP), SI
CALL *runtime·_vdso(SB)
RET
// void runtime·closeonexec(int32 fd);
TEXT runtime·closeonexec(SB),7,$0
MOVL $55, AX // fcntl
MOVL 4(SP), BX // fd
MOVL $2, CX // F_SETFD
MOVL $1, DX // FD_CLOEXEC
CALL *runtime·_vdso(SB)
RET
......@@ -347,3 +347,46 @@ TEXT runtime·sched_getaffinity(SB),7,$0
MOVL $204, AX // syscall entry
SYSCALL
RET
// int32 runtime·epollcreate(int32 size);
TEXT runtime·epollcreate(SB),7,$0
MOVL 8(SP), DI
MOVL $213, AX // syscall entry
SYSCALL
RET
// int32 runtime·epollcreate1(int32 flags);
TEXT runtime·epollcreate1(SB),7,$0
MOVL 8(SP), DI
MOVL $291, AX // syscall entry
SYSCALL
RET
// int32 runtime·epollctl(int32 epfd, int32 op, int32 fd, EpollEvent *ev);
TEXT runtime·epollctl(SB),7,$0
MOVL 8(SP), DI
MOVL 12(SP), SI
MOVL 16(SP), DX
MOVQ 24(SP), R10
MOVL $233, AX // syscall entry
SYSCALL
RET
// int32 runtime·epollwait(int32 epfd, EpollEvent *ev, int32 nev, int32 timeout);
TEXT runtime·epollwait(SB),7,$0
MOVL 8(SP), DI
MOVQ 16(SP), SI
MOVL 24(SP), DX
MOVL 28(SP), R10
MOVL $232, AX // syscall entry
SYSCALL
RET
// void runtime·closeonexec(int32 fd);
TEXT runtime·closeonexec(SB),7,$0
MOVL 8(SP), DI // fd
MOVQ $2, SI // F_SETFD
MOVQ $1, DX // FD_CLOEXEC
MOVL $72, AX // fcntl
SYSCALL
RET
......@@ -25,9 +25,6 @@ enum
{
FUTEX_WAIT = 0,
FUTEX_WAKE = 1,
EINTR = 4,
EAGAIN = 11,
};
// Atomically,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment