Commit 71f7d77e authored by Kirill Smelkov's avatar Kirill Smelkov

.

parent 242c3d85
...@@ -53,6 +53,8 @@ type Master struct { ...@@ -53,6 +53,8 @@ type Master struct {
ctlStop chan ctlStop // request to stop cluster ctlStop chan ctlStop // request to stop cluster
ctlShutdown chan chan error // request to shutdown cluster XXX with ctx too ? ctlShutdown chan chan error // request to shutdown cluster XXX with ctx too ?
wantToStart chan chan error // main -> recovery
// channels from various workers to main driver // channels from various workers to main driver
nodeCome chan nodeCome // node connected nodeCome chan nodeCome // node connected
nodeLeave chan nodeLeave // node disconnected nodeLeave chan nodeLeave // node disconnected
...@@ -95,23 +97,68 @@ func NewMaster(clusterName string) *Master { ...@@ -95,23 +97,68 @@ func NewMaster(clusterName string) *Master {
// XXX NotifyNodeInformation to all nodes whenever nodetab changes // XXX NotifyNodeInformation to all nodes whenever nodetab changes
// XXX -> Start(), Stop() // XXX -> Start(), Stop()
/*
func (m *Master) SetClusterState(state ClusterState) error { // Start requests cluster to eventually transition into running state
// it returns an error if such transition is not currently possible (e.g. partition table is not operational)
// it returns nil if the transition began.
// NOTE upon successfull return cluster is not yet in running state - the transition will
// take time and could be also automatically aborted due to cluster environment change (e.g.
// a storage node goes down)
func (m *Master) Start() error {
ch := make(chan error) ch := make(chan error)
m.ctlState <- ctlState{state, ch} m.ctlStart <- ctlStart{ch}
return <-ch return <-ch
} }
*/
// run implements main master cluster management logic: node tracking, cluster XXX -> only top-level // Stop requests cluster to eventually transition into recovery state
// XXX should be always possible ?
func (m *Master) Stop() error {
ch := make(chan error)
m.ctlStop <- ctlStop{ch}
return <-ch
}
// Shutdown requests all known nodes in the cluster to stop
func (m *Master) Shutdown() error {
panic("TODO")
}
func (m *Master) setClusterState(state ClusterState) {
m.clusterState = state
// TODO notify subscribers
}
func (m *Master) xxx(ctx ...) {
var err error
for ctx.Err() == nil {
err = recovery(ctx)
if err != nil {
return // XXX
}
// successful recovery -> verify
err = verify(ctx)
if err != nil {
continue // -> recovery
}
// successful verify -> service
err = service(ctx)
if err != nil {
// XXX what about shutdown ?
continue // -> recovery
}
}
}
// run implements main master cluster management logic: node tracking, cluster
// state updates, scheduling data movement between storage nodes etc // state updates, scheduling data movement between storage nodes etc
func (m *Master) run(ctx context.Context) { func (m *Master) run(ctx context.Context) {
// // current function to ask/control a storage depending on current cluster state go m.recovery(ctx)
// // + associated context covering all storage nodes
// // XXX + waitgroup ?
// storCtl := m.storCtlRecovery
// storCtlCtx, storCtlCancel := context.WithCancel(ctx)
for { for {
select { select {
...@@ -128,18 +175,24 @@ func (m *Master) run(ctx context.Context) { ...@@ -128,18 +175,24 @@ func (m *Master) run(ctx context.Context) {
break break
} }
// check preconditions for start ch := make(chan error)
if !m.partTab.OperationalWith(&m.nodeTab) { select {
// XXX err ctx case <-ctx.Done():
// TODO + how much % PT is covered // XXX how to avoid checking this ctx.Done everywhere?
c.resp <- fmt.Errorf("start: non-operational partition table") c.resp <- ctx.Err()
panic("TODO")
case m.wantToStart <- ch:
}
err := <-ch
c.resp <- err
if err != nil {
break break
} }
// XXX cancel/stop/wait for current recovery task // recovery said it is ok to start and finished - launch verification
// XXX start starting task m.setClusterState(ClusterVerifying)
go m.verify(ctx)
// command to stop cluster // command to stop cluster
case <-m.ctlStop: case <-m.ctlStop:
...@@ -147,40 +200,7 @@ func (m *Master) run(ctx context.Context) { ...@@ -147,40 +200,7 @@ func (m *Master) run(ctx context.Context) {
// command to shutdown // command to shutdown
case <-m.ctlShutdown: case <-m.ctlShutdown:
/*
// node connects & requests identification
case n := <-m.nodeCome:
nodeInfo, ok := m.accept(n)
if !(ok && nodeInfo.NodeType == STORAGE) {
break
}
// new storage node joined cluster
// XXX consider .clusterState change
// launch current storage control work on the joined node
go storCtl(storCtlCtx, n.link)
// TODO consider adjusting partTab
// node disconnects
case _ = <-m.nodeLeave:
// TODO // TODO
// a storage node came through recovery - let's see whether
// ptid ↑ and if so we should take partition table from there
case r := <-m.storRecovery:
if r.partTab.ptid > m.partTab.ptid {
m.partTab = r.partTab
// XXX also transfer subscribers ?
// XXX -> during recovery no one must be subscribed to partTab
}
// XXX consider clusterState change
*/
} }
} }
...@@ -205,7 +225,7 @@ func (m *Master) recovery(ctx context.Context) { ...@@ -205,7 +225,7 @@ func (m *Master) recovery(ctx context.Context) {
defer rcancel() defer rcancel()
inprogress := 0 inprogress := 0
// start recovery on all storages we are currently in touch // start recovery on all storages we are currently in touch with
for _, stor := range m.nodeTab.StorageList() { for _, stor := range m.nodeTab.StorageList() {
if stor.Info.NodeState > DOWN { // XXX state cmp ok ? if stor.Info.NodeState > DOWN { // XXX state cmp ok ?
inprogress++ inprogress++
...@@ -216,10 +236,6 @@ func (m *Master) recovery(ctx context.Context) { ...@@ -216,10 +236,6 @@ func (m *Master) recovery(ctx context.Context) {
loop: loop:
for { for {
select { select {
case <-ctx.Done():
// XXX
break loop
case n := <-m.nodeCome: case n := <-m.nodeCome:
node, ok := m.accept(n, /* XXX do not accept clients */) node, ok := m.accept(n, /* XXX do not accept clients */)
if !ok { if !ok {
...@@ -234,7 +250,8 @@ loop: ...@@ -234,7 +250,8 @@ loop:
m.nodeTab.UpdateLinkDown(n.link) m.nodeTab.UpdateLinkDown(n.link)
// XXX update something indicating cluster currently can be operational or not ? // XXX update something indicating cluster currently can be operational or not ?
// result of a storage recovery // a storage node came through recovery - let's see whether
// ptid ↑ and if so we should take partition table from there
case r := <-recovery: case r := <-recovery:
inprogress-- inprogress--
...@@ -254,9 +271,11 @@ loop: ...@@ -254,9 +271,11 @@ loop:
// XXX update something indicating cluster currently can be operational or not ? // XXX update something indicating cluster currently can be operational or not ?
// request from master: request "ok to start?" - if ok we reply ok and exit
// request from master: "I want to start - ok?" - if ok we reply ok and exit
// if not ok - we just reply not ok // if not ok - we just reply not ok
case s := <-...: //case s := <-m.wantToStart:
case c := <-m.ctlStart:
if m.partTab.OperationalWith(&m.nodeTab) { if m.partTab.OperationalWith(&m.nodeTab) {
// reply "ok to start" after whole recovery finishes // reply "ok to start" after whole recovery finishes
...@@ -267,12 +286,19 @@ loop: ...@@ -267,12 +286,19 @@ loop:
rcancel() rcancel()
defer func() { defer func() {
s.resp <- nil s <- nil
}() }()
break loop break loop
} }
s.resp <- fmt.Errorf("cluster is non-operational") s <- fmt.Errorf("start: cluster is non-operational")
case c := <-m.ctlStop:
c.resp <- nil // we are already recovering
case <-ctx.Done():
// XXX
break loop
} }
} }
...@@ -363,9 +389,7 @@ func storCtlRecovery(ctx context.Context, link *NodeLink, res chan storRecovery) ...@@ -363,9 +389,7 @@ func storCtlRecovery(ctx context.Context, link *NodeLink, res chan storRecovery)
// verify is a process that drives cluster via verification phase // verify is a process that drives cluster via verification phase
// //
// prerequisite for start: .partTab is operational wrt .nodeTab // prerequisite for start: .partTab is operational wrt .nodeTab
// func (m *Master) verify(ctx context.Context) error { //, storv []*NodeLink) error {
// XXX draft: Cluster Verify if []Stor is fixed
func (m *Master) verify(ctx context.Context, storv []*NodeLink) error {
// XXX ask every storage for verify and wait for _all_ them to complete? // XXX ask every storage for verify and wait for _all_ them to complete?
var err error var err error
...@@ -376,17 +400,20 @@ func (m *Master) verify(ctx context.Context, storv []*NodeLink) error { ...@@ -376,17 +400,20 @@ func (m *Master) verify(ctx context.Context, storv []*NodeLink) error {
// XXX do we need to reset m.lastOid / m.lastTid to 0 in the beginning? // XXX do we need to reset m.lastOid / m.lastTid to 0 in the beginning?
for _, stor := range storv { // start verification on all storages we are currently in touch with
for _, stor := range m.nodeTab.StorageList() {
inprogress++ inprogress++
go storCtlVerify(vctx, stor, verify) go storCtlVerify(vctx, stor.Link, verify)
} }
loop: loop:
for inprogress > 0 { for inprogress > 0 {
select { select {
case <-ctx.Done(): case n := <-m.nodeCome:
err = ctx.Err() // TODO
break loop
case n := <-m.nodeLeave:
// TODO
case v := <-verify: case v := <-verify:
inprogress-- inprogress--
...@@ -410,6 +437,19 @@ loop: ...@@ -410,6 +437,19 @@ loop:
m.lastTid = v.lastTid m.lastTid = v.lastTid
} }
} }
case c := <-m.ctlStart:
c.resp <- nil // we are already starting
case c := <-m.ctlStop:
c.resp <- nil // ok
err = fmt.Errorf("stop requested")
break loop
case <-ctx.Done():
err = ctx.Err()
break loop
} }
} }
...@@ -484,14 +524,39 @@ func storCtlVerify(ctx context.Context, link *NodeLink, res chan storVerify) { ...@@ -484,14 +524,39 @@ func storCtlVerify(ctx context.Context, link *NodeLink, res chan storVerify) {
// service is the process that drives cluster during running state // service is the process that drives cluster during running state
// //
// XXX draft: Cluster Running if []Stor is fixed func (m *Master) service(ctx context.Context) {
func (m *Master) service(ctx context.Context, storv []*NodeLink) {
loop:
for {
select {
case n := <-m.nodeCome:
// TODO // TODO
}
// XXX draft: Cluster Stopping if []Stor is fixed case n := <-m.nodeLeave:
func (m *Master) stop(ctx context.Context, storv []*NodeLink) {
// TODO // TODO
// XXX what else ? (-> txn control at least)
case c := <-m.ctlStart:
c.resp <- nil // we are already started
case c := <-m.ctlStop:
c.resp <- nil // ok
err = fmt.Errorf("stop requested")
break loop
case <-ctx.Done():
err = ctx.Err()
break loop
}
}
if err != nil {
// TODO
}
return err
} }
// accept processes identification request of just connected node and either accepts or declines it // accept processes identification request of just connected node and either accepts or declines it
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment