Commit dbf5a97b authored by Kirill Smelkov's avatar Kirill Smelkov

go/zodb/internal/weak: Try to fix GC crashes via reworking Ref to keep only one word instead of two

We are observing garbage-collector crashes due to weak package under
load(*) with GC crashing as e.g.

    runtime: full=0xc0001f10000005 next=205 jobs=204 nDataRoots=1 nBSSRoots=1 nSpanRoots=16 nStackRoots=184
    panic: non-empty mark queue after concurrent mark
    fatal error: panic on system stack

    runtime stack:
    runtime.throw({0x5c60fe?, 0x601d70?})
            /home/kirr/src/tools/go/go1.21/src/runtime/panic.go:1077 +0x5c fp=0xc000051e88 sp=0xc000051e58 pc=0x436efc
    panic({0x585100?, 0x601d70?})
            /home/kirr/src/tools/go/go1.21/src/runtime/panic.go:840 +0x6ea fp=0xc000051f38 sp=0xc000051e88 pc=0x436e0a
    runtime.gcMark(0x118946?)
            /home/kirr/src/tools/go/go1.21/src/runtime/mgc.go:1464 +0x40c fp=0xc000051fb0 sp=0xc000051f38 pc=0x41bd6c
    runtime.gcMarkTermination.func1()
            /home/kirr/src/tools/go/go1.21/src/runtime/mgc.go:962 +0x17 fp=0xc000051fc8 sp=0xc000051fb0 pc=0x41b277
    runtime.systemstack()
            /home/kirr/src/tools/go/go1.21/src/runtime/asm_amd64.s:509 +0x4a fp=0xc000051fd8 sp=0xc000051fc8 pc=0x468eea

One problem in current implementation is that weak.Ref keeps two words
for the copy of original interface object and recreates that interface
object on Ref.Get from those two words _nonatomically_. Which is
explicitly documented by Go memory model as something that can lead to
corrupted memory and crashes. From https://go.dev/ref/mem#restrictions:

    This means that races on multiword data structures can lead to
    inconsistent values not corresponding to a single write. When the values
    depend on the consistency of internal (pointer, length) or (pointer,
    type) pairs, as can be the case for interface values, maps, slices, and
    strings in most Go implementations, such races can in turn lead to
    arbitrary memory corruption.

We can avoid doing multiword writes during object resurrection by using
concrete type T* instead of interface{}.

Unfortunately as wendelin.core@9b44fc23
shows it does not help with the above issue and the GC continues to
crash with the same "panic: non-empty mark queue after concurrent mark"
message. This is because weak.Ref implementation needs tight integration
with concurrent GC that Go does and in practice we are unable to do that
from outside of Go runtime internals. See e.g.
https://github.com/golang/go/commit/dfc86e922cd0 and
https://github.com/golang/go/commit/79fd633632cd to get an idea what
kind of integration that needs to be.

Anyway before disabling weak references support I wanted to commit this
change to show that one-word approach was tried and it does not work.
This leaves a trace in the history.

On the good side we are going to use standard package weak in the future
hopefully (https://github.com/golang/go/issues/67552). That needs to
wait for at least Go 1.24 though.

(*) see https://github.com/golang/go/issues/41303 for details
parent 12a6a923
// Copyright (C) 2018-2021 Nexedi SA and Contributors.
// Copyright (C) 2018-2024 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
......@@ -118,7 +118,7 @@ type LiveCache struct {
//
// NOTE2 finalizers don't run on when they are attached to an object in cycle.
// Hopefully we don't have cycles with BTree/Bucket.
objtab map[Oid]*weak.Ref // oid -> weak.Ref(IPersistent)
objtab map[Oid]*weak.Ref[Persistent]
// hooks for application to influence live caching decisions.
control LiveCacheControl
......@@ -185,7 +185,7 @@ func newConnection(db *DB, at Tid) *Connection {
at: at,
cache: LiveCache{
pinned: make(map[Oid]IPersistent),
objtab: make(map[Oid]*weak.Ref),
objtab: make(map[Oid]*weak.Ref[Persistent]),
},
}
if cc := db.opt.CacheControl; cc != nil {
......@@ -236,7 +236,7 @@ func (cache *LiveCache) Get(oid Oid) IPersistent {
wobj := cache.objtab[oid]
if wobj != nil {
if xobj := wobj.Get(); xobj != nil {
obj = xobj.(IPersistent)
obj = xobj.instance
}
}
......@@ -257,7 +257,7 @@ func (cache *LiveCache) setNew(oid Oid, obj IPersistent) {
cache.pinned[oid] = obj
// XXX assert .objtab[oid] == nil ?
} else {
cache.objtab[oid] = weak.NewRef(obj)
cache.objtab[oid] = weak.NewRef(obj.persistent())
// XXX assert .pinned[oid] == nil ?
}
}
......@@ -269,7 +269,7 @@ func (cache *LiveCache) forEach(f func(IPersistent)) {
}
for _, wobj := range cache.objtab {
if xobj := wobj.Get(); xobj != nil {
f(xobj.(IPersistent))
f(xobj.instance)
}
}
}
......@@ -285,7 +285,7 @@ func (cache *LiveCache) SetControl(c LiveCacheControl) {
// reclassify all objects
c2 := *cache
cache.objtab = make(map[Oid]*weak.Ref)
cache.objtab = make(map[Oid]*weak.Ref[Persistent])
cache.pinned = make(map[Oid]IPersistent)
c2.forEach(func(obj IPersistent) {
cache.setNew(obj.POid(), obj)
......
// Copyright (C) 2018 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
// Copyright (C) 2018-2024 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// based on:
// initially based on:
// https://groups.google.com/d/msg/golang-nuts/PYWxjT2v6ps/dL71oJk1mXEJ
// https://play.golang.org/p/f9HY6-z8Pp
//
// see the following bug for intricate troubles of original implementation:
// https://github.com/golang/go/issues/41303
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
......@@ -22,6 +25,9 @@
// See https://www.nexedi.com/licensing for rationale and options.
// Package weak provides weak references for Go.
//
// Note: this package should be superceeded with std package weak when
// proposal for that gets implemented: https://github.com/golang/go/issues/67552.
package weak
import (
......@@ -30,13 +36,6 @@ import (
"unsafe"
)
// iface is how Go runtime represents an interface.
//
// NOTE layout must be synchronized to Go runtime representation.
type iface struct {
typ uintptr // type
data uintptr // data
}
// weakRefState represents current state of an object Ref points to.
type weakRefState int32
......@@ -48,15 +47,15 @@ const (
)
// Ref is a weak reference.
// Ref[T] is a weak reference.
//
// Create one with NewRef and retrieve referenced object with Get.
//
// There must be no more than 1 weak reference to any object.
// Weak references must not be attached to an object on which runtime.SetFinalizer is also used.
// Weak references must not be copied.
type Ref struct {
iface
type Ref[T any] struct {
iptr uintptr
// XXX try to do without mutex and only with atomics
mu sync.Mutex
......@@ -66,17 +65,17 @@ type Ref struct {
// NewRef creates new weak reference pointing to obj.
//
// TODO + onrelease callback?
func NewRef(obj interface{}) *Ref {
func NewRef[T any](obj *T) *Ref[T] {
// since starting from ~ Go1.4 the GC is precise, we can save interface
// pointers to uintptr and that won't prevent GC from garbage
// collecting the object.
w := &Ref{
iface: *(*iface)(unsafe.Pointer(&obj)),
w := &Ref[T]{
iptr: (uintptr)(unsafe.Pointer(obj)),
state: objLive,
}
var release func(interface{})
release = func(obj interface{}) {
var release func(*T)
release = func(obj *T) {
// GC decided that the object is no longer reachable and
// scheduled us to run as finalizer. During the time till we
// actually run, Ref.Get might have been come to run and
......@@ -101,15 +100,13 @@ func NewRef(obj interface{}) *Ref {
//
// If original object is still alive - it is returned.
// If not - nil is returned.
func (w *Ref) Get() (obj interface{}) {
func (w *Ref[T]) Get() (obj *T) {
w.mu.Lock()
if w.state != objReleased {
w.state = objGot
// recreate interface{} from saved words.
// XXX do writes as pointers so that compiler emits write barriers to notify GC?
i := (*iface)(unsafe.Pointer(&obj))
*i = w.iface
// recreate pointer from saved word.
obj = (*T)(unsafe.Pointer(w.iptr))
}
w.mu.Unlock()
return obj
......
// Copyright (C) 2018 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
// Copyright (C) 2018-2024 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
......@@ -26,33 +26,6 @@ import (
"unsafe"
)
// verify that interface <-> iface works ok.
func TestIface(t *testing.T) {
var i interface{}
var fi *iface
isize := unsafe.Sizeof(i)
fsize := unsafe.Sizeof(*fi)
if isize != fsize {
t.Fatalf("sizeof(interface{}) (%d) != sizeof(iface) (%d)", isize, fsize)
}
i = 3
var j interface{}
if !(j == nil && i != j) {
t.Fatalf("i == j ? (i: %#v, j: %#v}", i, j)
}
fi = (*iface)(unsafe.Pointer(&i))
fj := (*iface)(unsafe.Pointer(&j))
*fj = *fi
if i != j {
t.Fatalf("i (%#v) != j (%#v)", i, j)
}
}
func TestWeakRef(t *testing.T) {
type T struct{ _ [8]int64 } // large enough not to go into tinyalloc
......@@ -87,13 +60,9 @@ func TestWeakRef(t *testing.T) {
p = nil
GC()
assertEq(w.state, objLive) // fin ran and downgraded got -> live
switch p_ := w.Get().(type) {
default:
t.Fatalf("Get after objGot -> objLive: %#v", p_)
case *T:
if uintptr(unsafe.Pointer(p_)) != pptr {
t.Fatal("Get after objGot -> objLive: T, but ptr is not the same")
}
p_ := w.Get()
if uintptr(unsafe.Pointer(p_)) != pptr {
t.Fatal("Get after objGot -> objLive: ptr is not the same")
}
assertEq(w.state, objGot)
......@@ -102,5 +71,5 @@ func TestWeakRef(t *testing.T) {
GC()
assertEq(w.state, objReleased) // fin ran again and released the object
assertEq(w.Get(), nil)
assertEq(w.Get(), (*T)(nil))
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment