Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
N
neo
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Stefane Fermigier
neo
Commits
7af948cf
Commit
7af948cf
authored
Jan 04, 2017
by
Julien Muchembled
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Lockless stores/checks during replication
parent
b7a5bc99
Changes
11
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
179 additions
and
66 deletions
+179
-66
TODO
TODO
+3
-1
neo/client/app.py
neo/client/app.py
+3
-1
neo/client/handlers/storage.py
neo/client/handlers/storage.py
+7
-4
neo/lib/protocol.py
neo/lib/protocol.py
+6
-4
neo/storage/app.py
neo/storage/app.py
+6
-3
neo/storage/handlers/client.py
neo/storage/handlers/client.py
+4
-4
neo/storage/handlers/master.py
neo/storage/handlers/master.py
+2
-2
neo/storage/replicator.py
neo/storage/replicator.py
+4
-7
neo/storage/transactions.py
neo/storage/transactions.py
+109
-35
neo/tests/storage/testTransactions.py
neo/tests/storage/testTransactions.py
+1
-1
neo/tests/threaded/test.py
neo/tests/threaded/test.py
+34
-4
No files found.
TODO
View file @
7af948cf
...
...
@@ -61,7 +61,9 @@
partitions. Currently, reads succeed because feeding nodes don't delete
anything while the cluster is operational, for performance reasons:
deletion of dropped partitions must be reimplemented in a scalable way.
(HIGH AVAILABILITY)
The same thing happens for writes: storage nodes must discard
stores/checks of dropped partitions (in lockObject, that can be done by
raising ConflictError(None)). (HIGH AVAILABILITY)
Storage
- Use libmysqld instead of a stand-alone MySQL server.
...
...
neo/client/app.py
View file @
7af948cf
...
...
@@ -410,6 +410,8 @@ class Application(ThreadedApplication):
def
store
(
self
,
oid
,
serial
,
data
,
version
,
transaction
):
"""Store object."""
logging
.
debug
(
'storing oid %s serial %s'
,
dump
(
oid
),
dump
(
serial
))
if
not
serial
:
# BBB
serial
=
ZERO_TID
self
.
_store
(
self
.
_txn_container
.
get
(
transaction
),
oid
,
serial
,
data
)
def
_store
(
self
,
txn_context
,
oid
,
serial
,
data
,
data_serial
=
None
):
...
...
@@ -472,7 +474,7 @@ class Application(ThreadedApplication):
oid
,
(
serial
,
conflict_serial
)
=
pop_conflict
()
except
KeyError
:
return
if
conflict_serial
==
ZERO
_TID
:
if
conflict_serial
==
MAX
_TID
:
if
1
:
# XXX: disable deadlock avoidance code until it is fixed
logging
.
info
(
'Deadlock avoidance on %r:%r'
,
...
...
neo/client/handlers/storage.py
View file @
7af948cf
...
...
@@ -17,7 +17,7 @@
from
ZODB.TimeStamp
import
TimeStamp
from
neo.lib
import
logging
from
neo.lib.protocol
import
ZERO
_TID
from
neo.lib.protocol
import
MAX
_TID
from
neo.lib.util
import
dump
from
neo.lib.exception
import
NodeNotReady
from
neo.lib.handler
import
MTEventHandler
...
...
@@ -62,10 +62,13 @@ class StorageAnswersHandler(AnswerBaseHandler):
self
.
app
.
setHandlerData
(
args
)
def
answerStoreObject
(
self
,
conn
,
conflict
,
oid
,
serial
):
if
not
conflict
:
# Ignore if not locked on storage side.
return
txn_context
=
self
.
app
.
getHandlerData
()
object_stored_counter_dict
=
txn_context
[
'object_stored_counter_dict'
][
oid
]
if
conflict
:
if
conflict
!=
serial
:
# Conflicts can not be resolved now because 'conn' is locked.
# We must postpone the resolution (by queuing the conflict in
# 'conflict_dict') to avoid any deadlock with another thread that
...
...
@@ -76,10 +79,10 @@ class StorageAnswersHandler(AnswerBaseHandler):
# receive the conflict answer from the first store on S2.
logging
.
info
(
'%r report a conflict for %r with %r'
,
conn
,
dump
(
oid
),
dump
(
conflict
))
if
conflict
!=
ZERO
_TID
:
if
conflict
!=
MAX
_TID
:
# If this conflict is not already resolved, mark it for
# resolution.
if
conflict
<=
txn_context
[
'resolved_dict'
].
get
(
oid
,
ZERO_TID
):
if
conflict
<=
txn_context
[
'resolved_dict'
].
get
(
oid
,
''
):
return
if
conflict
in
object_stored_counter_dict
:
raise
NEOStorageError
(
'Storages %s accepted object %s'
...
...
neo/lib/protocol.py
View file @
7af948cf
...
...
@@ -940,10 +940,12 @@ class StoreObject(Packet):
"""
Ask to store an object. Send an OID, an original serial, a current
transaction ID, and data. C -> S.
Answer if an object has been stored. If an object is in conflict,
a serial of the conflicting transaction is returned. In this case,
if this serial is newer than the current transaction ID, a client
node must not try to resolve the conflict. S -> C.
As for IStorage, 'serial' is ZERO_TID for new objects.
Answered 'conflict' value means:
- None: lockless
- serial: ok
- MAX_TID: deadlock
- else: conflict
"""
_fmt
=
PStruct
(
'ask_store_object'
,
POID
(
'oid'
),
...
...
neo/storage/app.py
View file @
7af948cf
...
...
@@ -38,13 +38,14 @@ from neo.lib.debug import register as registerLiveDebugger
class
Application
(
BaseApplication
):
"""The storage node application."""
tm
=
None
def
__init__
(
self
,
config
):
super
(
Application
,
self
).
__init__
(
config
.
getSSL
(),
config
.
getDynamicMasterList
())
# set the cluster name
self
.
name
=
config
.
getCluster
()
self
.
tm
=
TransactionManager
(
self
)
self
.
dm
=
buildDatabaseManager
(
config
.
getAdapter
(),
(
config
.
getDatabase
(),
config
.
getEngine
(),
config
.
getWait
()),
)
...
...
@@ -93,7 +94,8 @@ class Application(BaseApplication):
def
log
(
self
):
self
.
em
.
log
()
self
.
nm
.
log
()
self
.
tm
.
log
()
if
self
.
tm
:
self
.
tm
.
log
()
if
self
.
pt
is
not
None
:
self
.
pt
.
log
()
...
...
@@ -184,6 +186,7 @@ class Application(BaseApplication):
for
conn
in
self
.
em
.
getConnectionList
():
if
conn
not
in
(
self
.
listening_conn
,
self
.
master_conn
):
conn
.
close
()
self
.
tm
=
TransactionManager
(
self
)
try
:
self
.
initialize
()
self
.
doOperation
()
...
...
@@ -194,6 +197,7 @@ class Application(BaseApplication):
logging
.
error
(
'primary master is down: %s'
,
msg
)
finally
:
self
.
checker
=
Checker
(
self
)
del
self
.
tm
def
connectToPrimary
(
self
):
"""Find a primary master node, and connect to it.
...
...
@@ -256,7 +260,6 @@ class Application(BaseApplication):
# Forget all unfinished data.
self
.
dm
.
dropUnfinishedData
()
self
.
tm
.
reset
()
self
.
task_queue
=
task_queue
=
deque
()
try
:
...
...
neo/storage/handlers/client.py
View file @
7af948cf
...
...
@@ -72,7 +72,7 @@ class ClientOperationHandler(EventHandler):
def
_askStoreObject
(
self
,
conn
,
oid
,
serial
,
compression
,
checksum
,
data
,
data_serial
,
ttid
,
request_time
):
try
:
self
.
app
.
tm
.
storeObject
(
ttid
,
serial
,
oid
,
compression
,
locked
=
self
.
app
.
tm
.
storeObject
(
ttid
,
serial
,
oid
,
compression
,
checksum
,
data
,
data_serial
)
except
ConflictError
,
err
:
# resolvable or not
...
...
@@ -93,7 +93,7 @@ class ClientOperationHandler(EventHandler):
duration
=
time
.
time
()
-
request_time
if
duration
>
SLOW_STORE
:
logging
.
info
(
'StoreObject delay: %.02fs'
,
duration
)
conn
.
answer
(
Packets
.
AnswerStoreObject
(
None
))
conn
.
answer
(
Packets
.
AnswerStoreObject
(
locked
))
def
askStoreObject
(
self
,
conn
,
oid
,
serial
,
compression
,
checksum
,
data
,
data_serial
,
ttid
):
...
...
@@ -171,7 +171,7 @@ class ClientOperationHandler(EventHandler):
def
_askCheckCurrentSerial
(
self
,
conn
,
ttid
,
serial
,
oid
,
request_time
):
try
:
self
.
app
.
tm
.
checkCurrentSerial
(
ttid
,
serial
,
oid
)
locked
=
self
.
app
.
tm
.
checkCurrentSerial
(
ttid
,
serial
,
oid
)
except
ConflictError
,
err
:
# resolvable or not
conn
.
answer
(
Packets
.
AnswerCheckCurrentSerial
(
err
.
tid
))
...
...
@@ -191,7 +191,7 @@ class ClientOperationHandler(EventHandler):
duration
=
time
.
time
()
-
request_time
if
duration
>
SLOW_STORE
:
logging
.
info
(
'CheckCurrentSerial delay: %.02fs'
,
duration
)
conn
.
answer
(
Packets
.
AnswerCheckCurrentSerial
(
None
))
conn
.
answer
(
Packets
.
AnswerCheckCurrentSerial
(
locked
))
# like ClientOperationHandler but read-only & only for tid <= backup_tid
...
...
neo/storage/handlers/master.py
View file @
7af948cf
...
...
@@ -31,8 +31,8 @@ class MasterOperationHandler(BaseMasterHandler):
dm
.
_setBackupTID
(
dm
.
getLastIDs
()[
0
]
or
ZERO_TID
)
dm
.
commit
()
def
notifyTransactionFinished
(
self
,
conn
,
*
args
,
**
kw
):
self
.
app
.
replicator
.
transactionFinished
(
*
args
,
**
kw
)
def
notifyTransactionFinished
(
self
,
conn
,
*
args
):
self
.
app
.
replicator
.
transactionFinished
(
*
args
)
def
notifyPartitionChanges
(
self
,
conn
,
ptid
,
cell_list
):
"""This is very similar to Send Partition Table, except that
...
...
neo/storage/replicator.py
View file @
7af948cf
...
...
@@ -136,7 +136,7 @@ class Replicator(object):
app
=
self
.
app
pt
=
app
.
pt
uuid
=
app
.
uuid
self
.
partition_dict
=
p
=
{}
self
.
partition_dict
=
{}
self
.
replicate_dict
=
{}
self
.
source_dict
=
{}
self
.
ttid_set
=
set
()
...
...
@@ -160,8 +160,7 @@ class Replicator(object):
p
.
next_trans
=
p
.
next_obj
=
next_tid
p
.
max_ttid
=
None
if
outdated_list
:
self
.
app
.
master_conn
.
ask
(
Packets
.
AskUnfinishedTransactions
(),
offset_list
=
outdated_list
)
self
.
app
.
tm
.
replicating
(
outdated_list
)
def
notifyPartitionChanges
(
self
,
cell_list
):
"""This is a callback from MasterOperationHandler."""
...
...
@@ -190,8 +189,7 @@ class Replicator(object):
p
.
max_ttid
=
INVALID_TID
added_list
.
append
(
offset
)
if
added_list
:
self
.
app
.
master_conn
.
ask
(
Packets
.
AskUnfinishedTransactions
(),
offset_list
=
added_list
)
self
.
app
.
tm
.
replicating
(
added_list
)
if
abort
:
self
.
abort
()
...
...
@@ -326,8 +324,7 @@ class Replicator(object):
p
.
next_obj
=
add64
(
tid
,
1
)
self
.
updateBackupTID
()
if
not
p
.
max_ttid
:
p
=
Packets
.
NotifyReplicationDone
(
offset
,
tid
)
self
.
app
.
master_conn
.
notify
(
p
)
self
.
app
.
tm
.
replicated
(
offset
,
tid
)
logging
.
debug
(
"partition %u replicated up to %s from %r"
,
offset
,
dump
(
tid
),
self
.
current_node
)
self
.
getCurrentConnection
().
setReconnectionNoDelay
()
...
...
neo/storage/transactions.py
View file @
7af948cf
This diff is collapsed.
Click to expand it.
neo/tests/storage/testTransactions.py
View file @
7af948cf
...
...
@@ -28,7 +28,7 @@ class TransactionManagerTests(NeoUnitTestBase):
self
.
app
=
Mock
()
# no history
self
.
app
.
dm
=
Mock
({
'getObjectHistory'
:
[]})
self
.
app
.
pt
=
Mock
({
'isAssigned'
:
True
})
self
.
app
.
pt
=
Mock
({
'isAssigned'
:
True
,
'getPartitions'
:
2
})
self
.
app
.
em
=
Mock
({
'setTimeout'
:
None
})
self
.
manager
=
TransactionManager
(
self
.
app
)
...
...
neo/tests/threaded/test.py
View file @
7af948cf
...
...
@@ -33,7 +33,7 @@ from neo.lib.exception import DatabaseFailure, StoppedOperation
from
neo.lib.protocol
import
CellStates
,
ClusterStates
,
NodeStates
,
Packets
,
\
ZERO_OID
,
ZERO_TID
from
..
import
expectedFailure
,
Patch
from
.
import
LockLock
,
NEOThreadedTest
,
with_cluster
from
.
import
ConnectionFilter
,
LockLock
,
NEOThreadedTest
,
with_cluster
from
neo.lib.util
import
add64
,
makeChecksum
,
p64
,
u64
from
neo.client.exception
import
NEOPrimaryMasterLost
,
NEOStorageError
from
neo.client.pool
import
CELL_CONNECTED
,
CELL_GOOD
...
...
@@ -1351,11 +1351,11 @@ class Test(NEOThreadedTest):
reports a conflict after that this conflict was fully resolved with
another node.
"""
def
answerStoreObject
(
orig
,
conn
,
conflict
,
**
kw
):
if
not
conflict
:
def
answerStoreObject
(
orig
,
conn
,
conflict
,
oid
,
serial
):
if
conflict
==
serial
:
p
.
revert
()
ll
()
orig
(
conn
,
conflict
,
**
kw
)
orig
(
conn
,
conflict
,
oid
,
serial
)
if
1
:
s0
,
s1
=
cluster
.
storage_list
t1
,
c1
=
cluster
.
getTransaction
()
...
...
@@ -1389,6 +1389,36 @@ class Test(NEOThreadedTest):
storage
.
store
(
oid
,
None
,
'*'
*
storage
.
_cache
.
_max_size
,
''
,
txn
)
self
.
assertRaises
(
POSException
.
ConflictError
,
storage
.
tpc_vote
,
txn
)
@
with_cluster
(
replicas
=
1
)
def
testConflictWithOutOfDateCell
(
self
,
cluster
):
"""
C1 S1 S0 C2
begin down begin
U <------- commit
up (remaining out-of-date due to suspended replication)
store ---> O (stored lockless)
`--------------> conflict
resolve -> stored lockless
`------------> locked
committed
"""
s0
,
s1
=
cluster
.
storage_list
t1
,
c1
=
cluster
.
getTransaction
()
c1
.
root
()[
'x'
]
=
x
=
PCounterWithResolution
()
t1
.
commit
()
s1
.
stop
()
cluster
.
join
((
s1
,))
x
.
value
+=
1
t2
,
c2
=
cluster
.
getTransaction
()
c2
.
root
()[
'x'
].
value
+=
2
t2
.
commit
()
with
ConnectionFilter
()
as
f
:
f
.
delayAskFetchTransactions
()
s1
.
resetNode
()
s1
.
start
()
self
.
tic
()
t1
.
commit
()
if
__name__
==
"__main__"
:
unittest
.
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment