Commit 3c60e6e8 authored by Russ Cox's avatar Russ Cox

runtime: fix races in stack scan

This fixes a hang during runtime.TestTraceStress.
It also fixes double-scan of stacks, which leads to
stack barrier installation failures.

Both of these have shown up as flaky failures on the dashboard.

Fixes #10941.

Change-Id: Ia2a5991ce2c9f43ba06ae1c7032f7c898dc990e0
Reviewed-on: https://go-review.googlesource.com/11089Reviewed-by: default avatarAustin Clements <austin@google.com>
parent a2aaede3
...@@ -1450,7 +1450,7 @@ func gcResetGState() (numgs int) { ...@@ -1450,7 +1450,7 @@ func gcResetGState() (numgs int) {
// allgs doesn't change. // allgs doesn't change.
lock(&allglock) lock(&allglock)
for _, gp := range allgs { for _, gp := range allgs {
gp.gcworkdone = false // set to true in gcphasework gp.gcscandone = false // set to true in gcphasework
gp.gcscanvalid = false // stack has not been scanned gp.gcscanvalid = false // stack has not been scanned
gp.gcalloc = 0 gp.gcalloc = 0
gp.gcscanwork = 0 gp.gcscanwork = 0
......
...@@ -40,7 +40,7 @@ func gcscan_m() { ...@@ -40,7 +40,7 @@ func gcscan_m() {
// Check that gc work is done. // Check that gc work is done.
for i := 0; i < local_allglen; i++ { for i := 0; i < local_allglen; i++ {
gp := allgs[i] gp := allgs[i]
if !gp.gcworkdone { if !gp.gcscandone {
throw("scan missed a g") throw("scan missed a g")
} }
} }
...@@ -130,35 +130,8 @@ func markroot(desc *parfor, i uint32) { ...@@ -130,35 +130,8 @@ func markroot(desc *parfor, i uint32) {
// non-STW phases. // non-STW phases.
shrinkstack(gp) shrinkstack(gp)
} }
if readgstatus(gp) == _Gdead {
gp.gcworkdone = true
} else {
gp.gcworkdone = false
}
restart := stopg(gp)
// goroutine will scan its own stack when it stops running. scang(gp)
// Wait until it has.
for readgstatus(gp) == _Grunning && !gp.gcworkdone {
}
// scanstack(gp) is done as part of gcphasework
// But to make sure we finished we need to make sure that
// the stack traps have all responded so drop into
// this while loop until they respond.
for !gp.gcworkdone {
status = readgstatus(gp)
if status == _Gdead {
gp.gcworkdone = true // scan is a noop
break
}
if status == _Gwaiting || status == _Grunnable {
restart = stopg(gp)
}
}
if restart {
restartg(gp)
}
} }
gcw.dispose() gcw.dispose()
...@@ -254,32 +227,6 @@ func gcAssistAlloc(size uintptr, allowAssist bool) { ...@@ -254,32 +227,6 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
}) })
} }
// The gp has been moved to a GC safepoint. GC phase specific
// work is done here.
//go:nowritebarrier
func gcphasework(gp *g) {
if gp.gcworkdone {
return
}
switch gcphase {
default:
throw("gcphasework in bad gcphase")
case _GCoff, _GCstw, _GCsweep:
// No work.
case _GCscan:
// scan the stack, mark the objects, put pointers in work buffers
// hanging off the P where this is being run.
// Indicate that the scan is valid until the goroutine runs again
scanstack(gp)
case _GCmark:
// No work.
case _GCmarktermination:
scanstack(gp)
// All available mark work will be emptied before returning.
}
gp.gcworkdone = true
}
//go:nowritebarrier //go:nowritebarrier
func scanstack(gp *g) { func scanstack(gp *g) {
if gp.gcscanvalid { if gp.gcscanvalid {
......
...@@ -359,67 +359,76 @@ func casgcopystack(gp *g) uint32 { ...@@ -359,67 +359,76 @@ func casgcopystack(gp *g) uint32 {
} }
} }
// stopg ensures that gp is stopped at a GC safe point where its stack can be scanned // scang blocks until gp's stack has been scanned.
// or in the context of a moving collector the pointers can be flipped from pointing // It might be scanned by scang or it might be scanned by the goroutine itself.
// to old object to pointing to new objects. // Either way, the stack scan has completed when scang returns.
// If stopg returns true, the caller knows gp is at a GC safe point and will remain there until func scang(gp *g) {
// the caller calls restartg. // Invariant; we (the caller, markroot for a specific goroutine) own gp.gcscandone.
// If stopg returns false, the caller is not responsible for calling restartg. This can happen // Nothing is racing with us now, but gcscandone might be set to true left over
// if another thread, either the gp itself or another GC thread is taking the responsibility // from an earlier round of stack scanning (we scan twice per GC).
// to do the GC work related to this thread. // We use gcscandone to record whether the scan has been done during this round.
func stopg(gp *g) bool { // It is important that the scan happens exactly once: if called twice,
for { // the installation of stack barriers will detect the double scan and die.
if gp.gcworkdone {
return false gp.gcscandone = false
}
// Endeavor to get gcscandone set to true,
// either by doing the stack scan ourselves or by coercing gp to scan itself.
// gp.gcscandone can transition from false to true when we're not looking
// (if we asked for preemption), so any time we lock the status using
// castogscanstatus we have to double-check that the scan is still not done.
for !gp.gcscandone {
switch s := readgstatus(gp); s { switch s := readgstatus(gp); s {
default: default:
dumpgstatus(gp) dumpgstatus(gp)
throw("stopg: gp->atomicstatus is not valid") throw("stopg: invalid status")
case _Gdead: case _Gdead:
return false // No stack.
gp.gcscandone = true
case _Gcopystack: case _Gcopystack:
// Loop until a new stack is in place. // Stack being switched. Go around again.
case _Grunnable, case _Grunnable, _Gsyscall, _Gwaiting:
_Gsyscall,
_Gwaiting:
// Claim goroutine by setting scan bit. // Claim goroutine by setting scan bit.
if !castogscanstatus(gp, s, s|_Gscan) { // Racing with execution or readying of gp.
break // The scan bit keeps them from running
// the goroutine until we're done.
if castogscanstatus(gp, s, s|_Gscan) {
if !gp.gcscandone {
scanstack(gp)
gp.gcscandone = true
}
restartg(gp)
} }
// In scan state, do work.
gcphasework(gp)
return true
case _Gscanrunnable, case _Gscanwaiting:
_Gscanwaiting, // newstack is doing a scan for us right now. Wait.
_Gscansyscall:
// Goroutine already claimed by another GC helper.
return false
case _Grunning: case _Grunning:
// Claim goroutine, so we aren't racing with a status // Goroutine running. Try to preempt execution so it can scan itself.
// transition away from Grunning. // The preemption handler (in newstack) does the actual scan.
if !castogscanstatus(gp, _Grunning, _Gscanrunning) {
// Optimization: if there is already a pending preemption request
// (from the previous loop iteration), don't bother with the atomics.
if gp.preemptscan && gp.preempt && gp.stackguard0 == stackPreempt {
break break
} }
// Mark gp for preemption. // Ask for preemption and self scan.
if !gp.gcworkdone { if castogscanstatus(gp, _Grunning, _Gscanrunning) {
if !gp.gcscandone {
gp.preemptscan = true gp.preemptscan = true
gp.preempt = true gp.preempt = true
gp.stackguard0 = stackPreempt gp.stackguard0 = stackPreempt
} }
// Unclaim.
casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning) casfrom_Gscanstatus(gp, _Gscanrunning, _Grunning)
return false
} }
} }
}
gp.preemptscan = false // cancel scan request if no longer needed
} }
// The GC requests that this routine be moved from a scanmumble state to a mumble state. // The GC requests that this routine be moved from a scanmumble state to a mumble state.
...@@ -451,20 +460,6 @@ func restartg(gp *g) { ...@@ -451,20 +460,6 @@ func restartg(gp *g) {
} }
} }
func stopscanstart(gp *g) {
_g_ := getg()
if _g_ == gp {
throw("GC not moved to G0")
}
if stopg(gp) {
if !isscanstatus(readgstatus(gp)) {
dumpgstatus(gp)
throw("GC not in scan state")
}
restartg(gp)
}
}
// stopTheWorld stops all P's from executing goroutines, interrupting // stopTheWorld stops all P's from executing goroutines, interrupting
// all goroutines at GC safe points and records reason as the reason // all goroutines at GC safe points and records reason as the reason
// for the stop. On return, only the current goroutine's P is running. // for the stop. On return, only the current goroutine's P is running.
......
...@@ -237,7 +237,7 @@ type g struct { ...@@ -237,7 +237,7 @@ type g struct {
preempt bool // preemption signal, duplicates stackguard0 = stackpreempt preempt bool // preemption signal, duplicates stackguard0 = stackpreempt
paniconfault bool // panic (instead of crash) on unexpected fault address paniconfault bool // panic (instead of crash) on unexpected fault address
preemptscan bool // preempted g does scan for gc preemptscan bool // preempted g does scan for gc
gcworkdone bool // debug: cleared at beginning of gc work phase cycle, set by gcphasework, tested at end of cycle gcscandone bool // g has scanned stack; protected by _Gscan bit in status
gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan gcscanvalid bool // false at start of gc cycle, true if G has not run since last scan
throwsplit bool // must not split stack throwsplit bool // must not split stack
raceignore int8 // ignore race detection events raceignore int8 // ignore race detection events
......
...@@ -756,12 +756,15 @@ func newstack() { ...@@ -756,12 +756,15 @@ func newstack() {
// be set and gcphasework will simply // be set and gcphasework will simply
// return. // return.
} }
gcphasework(gp) if !gp.gcscandone {
scanstack(gp)
gp.gcscandone = true
}
gp.preemptscan = false
gp.preempt = false
casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting) casfrom_Gscanstatus(gp, _Gscanwaiting, _Gwaiting)
casgstatus(gp, _Gwaiting, _Grunning) casgstatus(gp, _Gwaiting, _Grunning)
gp.stackguard0 = gp.stack.lo + _StackGuard gp.stackguard0 = gp.stack.lo + _StackGuard
gp.preempt = false
gp.preemptscan = false // Tells the GC premption was successful.
gogo(&gp.sched) // never return gogo(&gp.sched) // never return
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment