.

7aa13998 · Kirill Smelkov · 06685e0b · 7aa13998 · 7aa13998
Commit 7aa13998 authored Feb 18, 2021 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 40 additions and 23 deletions

go/neo/master.go go/neo/master.go +39 -22

go/neo/t_cluster_test.go go/neo/t_cluster_test.go +1 -1

No files found.
--- a/go/neo/master.go
+++ b/go/neo/master.go
@@ -181,7 +181,7 @@ func NewMaster(clusterName string, net xnet.Networker) *Master {
 // NOTE upon successful return cluster is not yet in running state - the transition will
 // take time and could be also automatically aborted due to cluster environment change (e.g.
 // a storage node goes down).
-func (m *Master) _Start() error {
+func (m *Master) Start() error {
 	ech := make(chan error)
 	m.ctlStart <- ech
 	return <-ech
@@ -392,7 +392,19 @@ func (m *Master) recovery(ctx context.Context) (err error) {
 	inprogress := 0                       // in-progress stor recoveries
 //	wg := &sync.WaitGroup{}
-	start := false // whether we were instructed to start
+	// start requests received via .ctlStart while readyToStart.
+	// On success they are answered only after full recovery completes.
+	startReqv := []chan error{}
+	errStartNonOperational := fmt.Errorf("start: cluster is non-operational")
+	defer func() {
+		errStart := errStartNonOperational
+		if err == nil {
+			errStart = nil
+		}
+		for _, ech := range startReqv {
+			ech <- errStart
+		}
+	}()
 //trace:event traceMasterStartReady(m *Master, ready bool)
 	readyToStart := false // whether cluster currently can be operational or not
 	updateReadyToStart := func() {
@@ -409,12 +421,26 @@ func (m *Master) recovery(ctx context.Context) (err error) {
 			ready = (nup > 0 && inprogress == 0)
 		} else {
-			ready = m.node.State.PartTab.OperationalWith(m.node.State.NodeTab)	// XXX + node state
+			ready = m.node.State.PartTab.OperationalWith(m.node.State.NodeTab)
 		}
 		if readyToStart != ready {
+			state := "ready"
+			if !ready {
+				state = "not ready"
+			}
+			log.Info(ctx, "cluster is %s to start", state)
 			readyToStart = ready
 			traceMasterStartReady(m, ready)
+			// cluster became non-operational - cancel previously queued start requests
+			if !ready {
+				for _, ech := range startReqv {
+					ech <- errStartNonOperational
+				}
+				startReqv = startReqv[:0]
+			}
 		}
 	}
@@ -473,9 +499,9 @@ func (m *Master) recovery(ctx context.Context) (err error) {
 		ctlStop   = nil
 		nodeComeq = nil
 	}
-loop:
 	for inprogress > 0 || !(
-		/*start*/(readyToStart && start) || /*stop*/(err != nil)) {
+		/*start*/(readyToStart && len(startReqv) > 0) || /*stop*/(err != nil)) {
 		select {
 		case <-ctxDone:
@@ -486,25 +512,16 @@ loop:
 		case ech := <-ctlStart:
 			if readyToStart {
 				log.Infof(ctx, "start command - we are ready")
-				// reply "ok to start" after whole recovery finishes
+				// queue start request. Right now we believe we can
+				// satisfy it, but during completion of spawned recovery
-				// XXX ok? we want to retrieve all recovery information first?
+				// tasks, the cluster might become non-operational again.
-				// XXX or initially S is in PENDING state and
+				// If it does, queued start requests will be canceled.
-				// transitions to RUNNING only after successful recovery?
+				startReqv = append(startReqv, ech)
+			} else {
-				rcancel()
+				log.Infof(ctx, "start command - err - we are not ready")
-				defer func() {
+				ech <- errStartNonOperational
-					// XXX can situation change while we are shutting down?
-					// XXX -> recheck logic with checking PT operational ^^^
-					// XXX    (depending on storages state)
-					ech <- nil
-				}()
-				break loop // FIXME
 			}
-			log.Infof(ctx, "start command - err - we are not ready")
-			ech <- fmt.Errorf("start: cluster is non-operational")
 		case ech := <-ctlStop:
 			close(ech) // ok; we are already recovering

--- a/go/neo/t_cluster_test.go
+++ b/go/neo/t_cluster_test.go
@@ -83,7 +83,7 @@ type tNode struct {
 // ITestMaster represents tested master node.
 type ITestMaster interface {
-	_Start() error
+	Start() error
 }
 // ITestStorage represents tested storage node.