Commit 32b94f13 authored by Daniel Morsing's avatar Daniel Morsing

runtime: move selectdone into g

Writing to selectdone on the stack of another goroutine meant a
pretty subtle dance between the select code and the stack copying
code. Instead move the selectdone variable into the g struct.

Change-Id: Id246aaf18077c625adef7ca2d62794afef1bdd1b
Reviewed-on: https://go-review.googlesource.com/53390Reviewed-by: default avatarAustin Clements <austin@google.com>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 89d74f54
......@@ -222,7 +222,7 @@ func chansend(c *hchan, ep unsafe.Pointer, block bool, callerpc uintptr) bool {
mysg.elem = ep
mysg.waitlink = nil
mysg.g = gp
mysg.selectdone = nil
mysg.isSelect = false
mysg.c = c
gp.waiting = mysg
gp.param = nil
......@@ -507,7 +507,7 @@ func chanrecv(c *hchan, ep unsafe.Pointer, block bool) (selected, received bool)
mysg.waitlink = nil
gp.waiting = mysg
mysg.g = gp
mysg.selectdone = nil
mysg.isSelect = false
mysg.c = c
gp.param = nil
c.recvq.enqueue(mysg)
......@@ -711,10 +711,16 @@ func (q *waitq) dequeue() *sudog {
sgp.next = nil // mark as removed (see dequeueSudog)
}
// if sgp participates in a select and is already signaled, ignore it
if sgp.selectdone != nil {
// claim the right to signal
if *sgp.selectdone != 0 || !atomic.Cas(sgp.selectdone, 0, 1) {
// if a goroutine was put on this queue because of a
// select, there is a small window between the goroutine
// being woken up by a different case and it grabbing the
// channel locks. Once it has the lock
// it removes itself from the queue, so we won't see it after that.
// We use a flag in the G struct to tell us when someone
// else has won the race to signal this goroutine but the goroutine
// hasn't removed itself from the queue yet.
if sgp.isSelect {
if !atomic.Cas(&sgp.g.selectDone, 0, 1) {
continue
}
}
......
......@@ -332,8 +332,8 @@ func releaseSudog(s *sudog) {
if s.elem != nil {
throw("runtime: sudog with non-nil elem")
}
if s.selectdone != nil {
throw("runtime: sudog with non-nil selectdone")
if s.isSelect {
throw("runtime: sudog with non-false isSelect")
}
if s.next != nil {
throw("runtime: sudog with non-nil next")
......
......@@ -272,11 +272,14 @@ type sudog struct {
// channel this sudog is blocking on. shrinkstack depends on
// this for sudogs involved in channel ops.
g *g
selectdone *uint32 // CAS to 1 to win select race (may point to stack)
next *sudog
prev *sudog
elem unsafe.Pointer // data element (may point to stack)
g *g
// isSelect indicates g is participating in a select, so
// g.selectDone must be CAS'd to win the wake-up race.
isSelect bool
next *sudog
prev *sudog
elem unsafe.Pointer // data element (may point to stack)
// The following fields are never accessed concurrently.
// For channels, waitlink is only accessed by g.
......@@ -367,6 +370,7 @@ type g struct {
cgoCtxt []uintptr // cgo traceback context
labels unsafe.Pointer // profiler labels
timer *timer // cached timer for time.Sleep
selectDone uint32 // are we participating in a select and did someone win the race?
// Per-G GC state
......
......@@ -286,7 +286,6 @@ func selectgo(sel *hselect) int {
var (
gp *g
done uint32
sg *sudog
c *hchan
k *scase
......@@ -353,7 +352,6 @@ loop:
// pass 2 - enqueue on all chans
gp = getg()
done = 0
if gp.waiting != nil {
throw("gp.waiting != nil")
}
......@@ -367,8 +365,7 @@ loop:
c = cas.c
sg := acquireSudog()
sg.g = gp
// Note: selectdone is adjusted for stack copies in stack1.go:adjustsudogs
sg.selectdone = (*uint32)(noescape(unsafe.Pointer(&done)))
sg.isSelect = true
// No stack splits between assigning elem and enqueuing
// sg on gp.waiting where copystack can find it.
sg.elem = cas.elem
......@@ -394,62 +391,9 @@ loop:
gp.param = nil
gopark(selparkcommit, nil, "select", traceEvGoBlockSelect, 1)
// While we were asleep, some goroutine came along and completed
// one of the cases in the select and woke us up (called ready).
// As part of that process, the goroutine did a cas on done above
// (aka *sg.selectdone for all queued sg) to win the right to
// complete the select. Now done = 1.
//
// If we copy (grow) our own stack, we will update the
// selectdone pointers inside the gp.waiting sudog list to point
// at the new stack. Another goroutine attempting to
// complete one of our (still linked in) select cases might
// see the new selectdone pointer (pointing at the new stack)
// before the new stack has real data; if the new stack has done = 0
// (before the old values are copied over), the goroutine might
// do a cas via sg.selectdone and incorrectly believe that it has
// won the right to complete the select, executing a second
// communication and attempting to wake us (call ready) again.
//
// Then things break.
//
// The best break is that the goroutine doing ready sees the
// _Gcopystack status and throws, as in #17007.
// A worse break would be for us to continue on, start running real code,
// block in a semaphore acquisition (sema.go), and have the other
// goroutine wake us up without having really acquired the semaphore.
// That would result in the goroutine spuriously running and then
// queue up another spurious wakeup when the semaphore really is ready.
// In general the situation can cascade until something notices the
// problem and causes a crash.
//
// A stack shrink does not have this problem, because it locks
// all the channels that are involved first, blocking out the
// possibility of a cas on selectdone.
//
// A stack growth before gopark above does not have this
// problem, because we hold those channel locks (released by
// selparkcommit).
//
// A stack growth after sellock below does not have this
// problem, because again we hold those channel locks.
//
// The only problem is a stack growth during sellock.
// To keep that from happening, run sellock on the system stack.
//
// It might be that we could avoid this if copystack copied the
// stack before calling adjustsudogs. In that case,
// syncadjustsudogs would need to recopy the tiny part that
// it copies today, resulting in a little bit of extra copying.
//
// An even better fix, not for the week before a release candidate,
// would be to put space in every sudog and make selectdone
// point at (say) the space in the first sudog.
systemstack(func() {
sellock(scases, lockorder)
})
sellock(scases, lockorder)
gp.selectDone = 0
sg = (*sudog)(gp.param)
gp.param = nil
......@@ -462,7 +406,7 @@ loop:
sglist = gp.waiting
// Clear all elem before unlinking from gp.waiting.
for sg1 := gp.waiting; sg1 != nil; sg1 = sg1.waitlink {
sg1.selectdone = nil
sg1.isSelect = false
sg1.elem = nil
sg1.c = nil
}
......
......@@ -751,7 +751,6 @@ func adjustsudogs(gp *g, adjinfo *adjustinfo) {
// might be in the stack.
for s := gp.waiting; s != nil; s = s.waitlink {
adjustpointer(adjinfo, unsafe.Pointer(&s.elem))
adjustpointer(adjinfo, unsafe.Pointer(&s.selectdone))
}
}
......@@ -768,10 +767,6 @@ func findsghi(gp *g, stk stack) uintptr {
if stk.lo <= p && p < stk.hi && p > sghi {
sghi = p
}
p = uintptr(unsafe.Pointer(sg.selectdone)) + unsafe.Sizeof(sg.selectdone)
if stk.lo <= p && p < stk.hi && p > sghi {
sghi = p
}
}
return sghi
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment