Commit 7aa13998 authored by Kirill Smelkov

.

parent 06685e0b
......@@ -181,7 +181,7 @@ func NewMaster(clusterName string, net xnet.Networker) *Master {
// NOTE upon successful return the cluster is not yet in running state - the transition will
// take time and could also be automatically aborted due to a cluster environment change (e.g.
// a storage node going down).
func (m *Master) _Start() error {
func (m *Master) Start() error {
ech := make(chan error)
m.ctlStart <- ech
return <-ech
......@@ -392,7 +392,19 @@ func (m *Master) recovery(ctx context.Context) (err error) {
inprogress := 0 // in-progress stor recoveries
// wg := &sync.WaitGroup{}
start := false // whether we were instructed to start
// requests to .ctlStart received while readyToStart;
// on success they are answered when full recovery completes
startReqv := []chan error{}
errStartNonOperational := fmt.Errorf("start: cluster is non-operational")
defer func() {
errStart := errStartNonOperational
if err == nil {
errStart = nil
}
for _, ech := range startReqv {
ech <- errStart
}
}()
//trace:event traceMasterStartReady(m *Master, ready bool)
readyToStart := false // whether cluster currently can be operational or not
updateReadyToStart := func() {
......@@ -409,12 +421,26 @@ func (m *Master) recovery(ctx context.Context) (err error) {
ready = (nup > 0 && inprogress == 0)
} else {
ready = m.node.State.PartTab.OperationalWith(m.node.State.NodeTab) // XXX + node state
ready = m.node.State.PartTab.OperationalWith(m.node.State.NodeTab)
}
if readyToStart != ready {
state := "ready"
if !ready {
state = "not ready"
}
log.Info(ctx, "cluster is %s to start", state)
readyToStart = ready
traceMasterStartReady(m, ready)
// cluster became non-operational - cancel previously queued start requests
if !ready {
for _, ech := range startReqv {
ech <- errStartNonOperational
}
startReqv = startReqv[:0]
}
}
}
......@@ -473,9 +499,9 @@ func (m *Master) recovery(ctx context.Context) (err error) {
ctlStop = nil
nodeComeq = nil
}
loop:
for inprogress > 0 || !(
/*start*/(readyToStart && start) || /*stop*/(err != nil)) {
/*start*/(readyToStart && len(startReqv) > 0) || /*stop*/(err != nil)) {
select {
case <-ctxDone:
......@@ -486,24 +512,15 @@ loop:
case ech := <-ctlStart:
if readyToStart {
log.Infof(ctx, "start command - we are ready")
// reply "ok to start" after whole recovery finishes
// XXX ok? we want to retrieve all recovery information first?
// XXX or initially S is in PENDING state and
// transitions to RUNNING only after successful recovery?
rcancel()
defer func() {
// XXX can situation change while we are shutting down?
// XXX -> recheck logic with checking PT operational ^^^
// XXX (depending on storages state)
ech <- nil
}()
break loop // FIXME
}
// queue the start request. Right now we believe we can
// satisfy it, but while the spawned recovery tasks are
// completing, the cluster might become non-operational again.
// If it does, queued start requests will be canceled.
startReqv = append(startReqv, ech)
} else {
log.Infof(ctx, "start command - err - we are not ready")
ech <- fmt.Errorf("start: cluster is non-operational")
ech <- errStartNonOperational
}
case ech := <-ctlStop:
close(ech) // ok; we are already recovering
......
......@@ -83,7 +83,7 @@ type tNode struct {
// ITestMaster represents tested master node.
//
// NOTE(review): _Start and Start are listed together here; this looks like the
// before/after pair of a method rename rather than two distinct methods -
// confirm which one the real interface declares.
type ITestMaster interface {
// presumably the pre-rename spelling of Start - TODO confirm.
_Start() error
// Start takes no arguments and reports its outcome as an error.
Start() error
}
// ITestStorage represents tested storage node.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment