Commit 17af3b47 authored by Julien Muchembled

master: fix possibly wrong knowledge of cells' backup_tid when resuming backup

The issue happens when commits occur while the backup cluster is down: on resume, the master believes those commits are already replicated and reports a wrong backup_tid to neoctl. The wrong value only corrected itself once:
- new commits triggered replication for all partitions;
- all storage nodes had really replicated them.

This also resulted in an inconsistent database when leaving backup mode during
this period.
parent c95c6c39
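For context, the heart of the fix is a small decision rule applied when the backup master reconnects and learns the upstream cluster's last transaction ID. The sketch below is only illustrative (plain integers stand in for NEO's 8-byte TIDs, and the function name is made up); the actual change is the BackupHandler.answerLastTransaction hunk in the diff below.

    # Illustrative sketch, not NEO's actual API: decide what a resumed backup
    # must re-replicate, given the last TID it knew and the upstream's last TID.
    def partitions_to_force(prev_tid, upstream_tid, partition_count):
        if prev_tid < upstream_tid:
            # Commits happened upstream while the backup was down; we do not
            # know which partitions they touched, so all of them must be
            # replicated again before backup_tid may reach 'upstream_tid'.
            return set(range(partition_count))
        if prev_tid > upstream_tid:
            # The upstream database reports an older TID than we recorded.
            raise RuntimeError("upstream DB truncated")
        return set()  # nothing was committed during the outage

    # Example: commits happened during the outage, 4 partitions in the table.
    assert partitions_to_force(10, 12, 4) == {0, 1, 2, 3}
    assert partitions_to_force(12, 12, 4) == set()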
@@ -194,9 +194,8 @@ class BackupApplication(object):
         for node in trigger_set:
             self.triggerBackup(node)
 
-    def invalidatePartitions(self, tid, partition_set):
+    def invalidatePartitions(self, tid, prev_tid, partition_set):
         app = self.app
-        prev_tid = app.getLastTransaction()
         app.setLastTransaction(tid)
         pt = app.pt
         trigger_set = set()
@@ -34,10 +34,18 @@ class BackupHandler(EventHandler):
 
     def answerLastTransaction(self, conn, tid):
         app = self.app
-        if tid != ZERO_TID:
-            app.invalidatePartitions(tid, set(xrange(app.pt.getPartitions())))
-        else: # upstream DB is empty
-            assert app.app.getLastTransaction() == tid
+        prev_tid = app.app.getLastTransaction()
+        if prev_tid < tid:
+            # Since we don't know which partitions were modified during our
+            # absence, we must force replication on all storages. As long as
+            # they haven't done this first check, our backup tid will remain
+            # inferior to this 'tid'. We don't know the real prev_tid, which is:
+            # >= app.app.getLastTransaction()
+            # < tid
+            # but passing 'tid' is good enough.
+            app.invalidatePartitions(tid, tid, xrange(app.pt.getPartitions()))
+        elif prev_tid != tid:
+            raise RuntimeError("upstream DB truncated")
         app.ignore_invalidations = False
 
     def invalidateObjects(self, conn, tid, oid_list):
@@ -47,4 +55,5 @@ class BackupHandler(EventHandler):
         getPartition = app.app.pt.getPartition
         partition_set = set(map(getPartition, oid_list))
         partition_set.add(getPartition(tid))
-        app.invalidatePartitions(tid, partition_set)
+        prev_tid = app.app.getLastTransaction()
+        app.invalidatePartitions(tid, prev_tid, partition_set)
@@ -333,6 +333,29 @@ class ReplicationTests(NEOThreadedTest):
         finally:
             upstream.stop()
 
+    @backup_test()
+    def testBackupTid(self, backup):
+        """
+        Check that the backup cluster does not claim it has all the data just
+        after it came back whereas new transactions were committed during its
+        absence.
+        """
+        importZODB = backup.upstream.importZODB()
+        importZODB(1)
+        self.tic()
+        last_tid = backup.upstream.last_tid
+        self.assertEqual(last_tid, backup.backup_tid)
+        backup.stop()
+        importZODB(1)
+        backup.reset()
+        with ConnectionFilter() as f:
+            f.add(lambda conn, packet:
+                isinstance(packet, Packets.AskFetchTransactions))
+            backup.start()
+            self.assertEqual(last_tid, backup.backup_tid)
+        self.tic()
+        self.assertEqual(1, self.checkBackup(backup))
+
     def testSafeTweak(self):
         """
         Check that tweak always tries to keep a minimum of (replicas + 1)