Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
N
neo
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Labels
Merge Requests
2
Merge Requests
2
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Jobs
Commits
Open sidebar
Kirill Smelkov
neo
Commits
06685e0b
Commit
06685e0b
authored
Feb 17, 2021
by
Kirill Smelkov
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
.
parent
b5b0419e
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
50 additions
and
30 deletions
+50
-30
go/neo/master.go
go/neo/master.go
+49
-29
go/neo/t_cluster_test.go
go/neo/t_cluster_test.go
+1
-1
No files found.
go/neo/master.go
View file @
06685e0b
...
@@ -42,9 +42,9 @@ package neo
...
@@ -42,9 +42,9 @@ package neo
import
(
import
(
"context"
"context"
stderrors
"errors"
"errors"
"fmt"
"fmt"
"sync"
//
"sync"
"time"
"time"
xxcontext
"lab.nexedi.com/kirr/go123/xcontext"
xxcontext
"lab.nexedi.com/kirr/go123/xcontext"
...
@@ -181,7 +181,7 @@ func NewMaster(clusterName string, net xnet.Networker) *Master {
...
@@ -181,7 +181,7 @@ func NewMaster(clusterName string, net xnet.Networker) *Master {
// NOTE upon successful return cluster is not yet in running state - the transition will
// NOTE upon successful return cluster is not yet in running state - the transition will
// take time and could be also automatically aborted due to cluster environment change (e.g.
// take time and could be also automatically aborted due to cluster environment change (e.g.
// a storage node goes down).
// a storage node goes down).
func
(
m
*
Master
)
Start
()
error
{
func
(
m
*
Master
)
_
Start
()
error
{
ech
:=
make
(
chan
error
)
ech
:=
make
(
chan
error
)
m
.
ctlStart
<-
ech
m
.
ctlStart
<-
ech
return
<-
ech
return
<-
ech
...
@@ -388,9 +388,9 @@ func (m *Master) recovery(ctx context.Context) (err error) {
...
@@ -388,9 +388,9 @@ func (m *Master) recovery(ctx context.Context) (err error) {
ctx
,
rcancel
:=
context
.
WithCancel
(
ctx
)
ctx
,
rcancel
:=
context
.
WithCancel
(
ctx
)
defer
rcancel
()
defer
rcancel
()
recover
yq
:=
make
(
chan
storRecovery
)
recover
edq
:=
make
(
chan
storRecovery
)
// <- result of 1 stor recovery
inprogress
:=
0
// in-progress stor recoveries
inprogress
:=
0
// in-progress stor recoveries
wg
:=
&
sync
.
WaitGroup
{}
//
wg := &sync.WaitGroup{}
start
:=
false
// whether we were instructed to start
start
:=
false
// whether we were instructed to start
//trace:event traceMasterStartReady(m *Master, ready bool)
//trace:event traceMasterStartReady(m *Master, ready bool)
...
@@ -418,15 +418,15 @@ func (m *Master) recovery(ctx context.Context) (err error) {
...
@@ -418,15 +418,15 @@ func (m *Master) recovery(ctx context.Context) (err error) {
}
}
}
}
//
XXX
set cluster state = RECOVERY
//
TODO (?)
set cluster state = RECOVERY
//
XXX
down clients
//
TODO
down clients
// goStorCtlRecovery spawns recovery task on a storage peer.
// goStorCtlRecovery spawns recovery task on a storage peer.
goStorCtlRecovery
:=
func
(
stor
*
_MasteredPeer
)
{
goStorCtlRecovery
:=
func
(
stor
*
_MasteredPeer
)
{
inprogress
++
inprogress
++
wg
.
Add
(
1
)
//
wg.Add(1)
stor
.
wg
.
Go
(
func
(
peerCtx
context
.
Context
)
error
{
stor
.
wg
.
Go
(
func
(
peerCtx
context
.
Context
)
error
{
defer
wg
.
Done
()
//
defer wg.Done()
ctx
,
cancel
:=
xxcontext
.
Merge
/*Cancel*/
(
ctx
,
peerCtx
)
ctx
,
cancel
:=
xxcontext
.
Merge
/*Cancel*/
(
ctx
,
peerCtx
)
defer
cancel
()
defer
cancel
()
...
@@ -438,7 +438,7 @@ func (m *Master) recovery(ctx context.Context) (err error) {
...
@@ -438,7 +438,7 @@ func (m *Master) recovery(ctx context.Context) (err error) {
})
})
ack
:=
make
(
chan
struct
{})
ack
:=
make
(
chan
struct
{})
recover
y
q
<-
storRecovery
{
stor
:
stor
,
partTab
:
pt
,
err
:
err
,
ack
:
ack
}
recover
ed
q
<-
storRecovery
{
stor
:
stor
,
partTab
:
pt
,
err
:
err
,
ack
:
ack
}
<-
ack
<-
ack
// canceled recovery does not mean we should down the storage node
// canceled recovery does not mean we should down the storage node
...
@@ -456,16 +456,34 @@ func (m *Master) recovery(ctx context.Context) (err error) {
...
@@ -456,16 +456,34 @@ func (m *Master) recovery(ctx context.Context) (err error) {
}
}
}
}
// during stop: react only to task completion and node leaving
ctxDone
:=
ctx
.
Done
()
ctlStart
:=
m
.
ctlStart
ctlStop
:=
m
.
ctlStop
nodeComeq
:=
m
.
nodeComeq
err
=
nil
stop
:=
func
(
stopErr
error
)
{
if
err
!=
nil
{
return
}
err
=
stopErr
rcancel
()
ctxDone
=
nil
ctlStart
=
nil
ctlStop
=
nil
nodeComeq
=
nil
}
loop
:
loop
:
for
!
(
inprogress
==
0
&&
readyToStart
&&
start
)
{
for
inprogress
>
0
||
!
(
/*start*/
(
readyToStart
&&
start
)
||
/*stop*/
(
err
!=
nil
))
{
select
{
select
{
case
<-
ctx
.
Done
()
:
case
<-
ctxDone
:
err
=
ctx
.
Err
()
stop
(
ctx
.
Err
())
break
loop
// request to start the cluster - if ok we exit replying ok
// request to start the cluster - if ok we exit replying ok
// if not ok - we just reply not ok
// if not ok - we just reply not ok
case
ech
:=
<-
m
.
ctlStart
:
case
ech
:=
<-
ctlStart
:
if
readyToStart
{
if
readyToStart
{
log
.
Infof
(
ctx
,
"start command - we are ready"
)
log
.
Infof
(
ctx
,
"start command - we are ready"
)
// reply "ok to start" after whole recovery finishes
// reply "ok to start" after whole recovery finishes
...
@@ -481,13 +499,13 @@ loop:
...
@@ -481,13 +499,13 @@ loop:
// XXX (depending on storages state)
// XXX (depending on storages state)
ech
<-
nil
ech
<-
nil
}()
}()
break
loop
break
loop
// FIXME
}
}
log
.
Infof
(
ctx
,
"start command - err - we are not ready"
)
log
.
Infof
(
ctx
,
"start command - err - we are not ready"
)
ech
<-
fmt
.
Errorf
(
"start: cluster is non-operational"
)
ech
<-
fmt
.
Errorf
(
"start: cluster is non-operational"
)
case
ech
:=
<-
m
.
ctlStop
:
case
ech
:=
<-
ctlStop
:
close
(
ech
)
// ok; we are already recovering
close
(
ech
)
// ok; we are already recovering
// peer (should be) disconnected
// peer (should be) disconnected
...
@@ -496,7 +514,7 @@ loop:
...
@@ -496,7 +514,7 @@ loop:
updateReadyToStart
()
updateReadyToStart
()
// node comes in and asks to be identified
// node comes in and asks to be identified
case
n
:=
<-
m
.
nodeComeq
:
case
n
:=
<-
nodeComeq
:
peer
,
ok
:=
m
.
identify
(
ctx
,
n
,
peer
,
ok
:=
m
.
identify
(
ctx
,
n
,
// XXX only accept:
// XXX only accept:
// - S -> PENDING
// - S -> PENDING
...
@@ -513,7 +531,7 @@ loop:
...
@@ -513,7 +531,7 @@ loop:
// a storage node came through recovery - let's see whether
// a storage node came through recovery - let's see whether
// ptid ↑ and if so we should take partition table from there
// ptid ↑ and if so we should take partition table from there
case
r
:=
<-
recover
y
q
:
case
r
:=
<-
recover
ed
q
:
close
(
r
.
ack
)
// for <-"node leave" to happen after <-recovery in case of err
close
(
r
.
ack
)
// for <-"node leave" to happen after <-recovery in case of err
inprogress
--
inprogress
--
...
@@ -533,11 +551,12 @@ loop:
...
@@ -533,11 +551,12 @@ loop:
}
}
}
}
/*
// wait all workers to finish (which should come without delay since it was cancelled)
// wait all workers to finish (which should come without delay since it was cancelled)
// XXX not good - some of the rest of the storages can fail in the
// XXX not good - some of the rest of the storages can fail in the
// meantime and this will lead to partTab to become non-opertional.
// meantime and this will lead to partTab to become non-opertional.
// XXX also: some of the recoveries could still _succeed_ (e.g.
// XXX also: some of the recoveries could still _succeed_ (e.g.
// successfuly recovery send was already queued to recover
y
q but not
// successfuly recovery send was already queued to recover
ed
q but not
// yet received) - this successful recovery could bring us newer
// yet received) - this successful recovery could bring us newer
// partTab fro which we should reconsider whether we have all needed
// partTab fro which we should reconsider whether we have all needed
// nodes up and running.
// nodes up and running.
...
@@ -550,7 +569,7 @@ loop:
...
@@ -550,7 +569,7 @@ loop:
loop2:
loop2:
for {
for {
select {
select {
case
r
:=
<-
recover
y
q
:
case r := <-recover
ed
q:
close(r.ack)
close(r.ack)
log.Error(ctx, r.err)
log.Error(ctx, r.err)
...
@@ -559,6 +578,7 @@ loop2:
...
@@ -559,6 +578,7 @@ loop2:
break loop2
break loop2
}
}
}
}
*/
if
err
!=
nil
{
if
err
!=
nil
{
return
err
return
err
...
@@ -618,8 +638,8 @@ func storCtlRecovery(ctx context.Context, stor *_MasteredPeer) (_ *xneo.Partitio
...
@@ -618,8 +638,8 @@ func storCtlRecovery(ctx context.Context, stor *_MasteredPeer) (_ *xneo.Partitio
}
}
var
errStopRequested
=
std
errors
.
New
(
"stop requested"
)
var
errStopRequested
=
errors
.
New
(
"stop requested"
)
var
errClusterDegraded
=
std
errors
.
New
(
"cluster became non-operatonal"
)
var
errClusterDegraded
=
errors
.
New
(
"cluster became non-operatonal"
)
// Cluster Verification (data recovery)
// Cluster Verification (data recovery)
...
@@ -648,7 +668,7 @@ func (m *Master) verify(ctx context.Context) (err error) {
...
@@ -648,7 +668,7 @@ func (m *Master) verify(ctx context.Context) (err error) {
ctx
,
vcancel
:=
context
.
WithCancel
(
ctx
)
ctx
,
vcancel
:=
context
.
WithCancel
(
ctx
)
defer
vcancel
()
defer
vcancel
()
verif
y
q
:=
make
(
chan
storVerify
)
// <- result of stor verify task
verif
ied
q
:=
make
(
chan
storVerify
)
// <- result of stor verify task
inprogress
:=
0
// in-progress verify tasks
inprogress
:=
0
// in-progress verify tasks
// NOTE we don't reset m.lastOid / m.lastTid to 0 in the beginning of verification
// NOTE we don't reset m.lastOid / m.lastTid to 0 in the beginning of verification
...
@@ -667,7 +687,7 @@ func (m *Master) verify(ctx context.Context) (err error) {
...
@@ -667,7 +687,7 @@ func (m *Master) verify(ctx context.Context) (err error) {
})
})
ack
:=
make
(
chan
struct
{})
ack
:=
make
(
chan
struct
{})
verif
y
q
<-
storVerify
{
stor
:
stor
,
lastOid
:
lastOid
,
lastTid
:
lastTid
,
err
:
err
,
ack
:
ack
}
verif
ied
q
<-
storVerify
{
stor
:
stor
,
lastOid
:
lastOid
,
lastTid
:
lastTid
,
err
:
err
,
ack
:
ack
}
<-
ack
<-
ack
// canceled verify does not mean we should down the storage node
// canceled verify does not mean we should down the storage node
...
@@ -747,7 +767,7 @@ func (m *Master) verify(ctx context.Context) (err error) {
...
@@ -747,7 +767,7 @@ func (m *Master) verify(ctx context.Context) (err error) {
// on error check - whether cluster became non-operational and stop verification if so
// on error check - whether cluster became non-operational and stop verification if so
//
//
// FIXME actually implement logic to decide to finish/rollback transactions
// FIXME actually implement logic to decide to finish/rollback transactions
case
v
:=
<-
verif
y
q
:
case
v
:=
<-
verif
ied
q
:
close
(
v
.
ack
)
// XXX explain why (see recovery)
close
(
v
.
ack
)
// XXX explain why (see recovery)
inprogress
--
inprogress
--
...
...
go/neo/t_cluster_test.go
View file @
06685e0b
...
@@ -83,7 +83,7 @@ type tNode struct {
...
@@ -83,7 +83,7 @@ type tNode struct {
// ITestMaster represents tested master node.
// ITestMaster represents tested master node.
type
ITestMaster
interface
{
type
ITestMaster
interface
{
Start
()
error
_
Start
()
error
}
}
// ITestStorage represents tested storage node.
// ITestStorage represents tested storage node.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment