Commit d0ac62aa authored by unknown's avatar unknown

bug#9924 - ndb backup abort handling

  Redo abort handling according to descr. in Backup.txt
bug#9960 - ndb backup
      increase wait completed timeout to 48 hours


ndb/include/kernel/signaldata/BackupImpl.hpp:
  Add nodeid to reply to be able to fake reply during NF
ndb/include/kernel/signaldata/BackupSignalData.hpp:
  new error codes
ndb/src/common/debugger/signaldata/BackupImpl.cpp:
  fix printout
ndb/src/kernel/blocks/backup/Backup.cpp:
  bug#9924 - ndb backup abort handling
    Redo abort handling according to descr. in Backup.txt
ndb/src/kernel/blocks/backup/Backup.hpp:
  bug#9924 - ndb backup abort handling
    Redo abort handling according to descr. in Backup.txt
ndb/src/kernel/blocks/backup/Backup.txt:
  bug#9924 - ndb backup abort handling
    Redo abort handling according to descr. in Backup.txt
ndb/src/kernel/blocks/backup/BackupInit.cpp:
  bug#9924 - ndb backup abort handling
    Redo abort handling according to descr. in Backup.txt
ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp:
  Init own version
ndb/src/mgmapi/mgmapi.cpp:
  bug#9960 - ndb backup
    increase wait completed timeout to 48 hours
ndb/src/mgmsrv/MgmtSrvr.cpp:
  Handle node failures activly
    (mainly for backup...)
ndb/src/mgmsrv/MgmtSrvr.hpp:
  Handle node failures activly
    (mainly for backup...)
ndb/src/mgmsrv/MgmtSrvrGeneralSignalHandling.cpp:
  Handle node failures activly
    (mainly for backup...)
ndb/src/ndbapi/ndberror.c:
  new error codes
ndb/test/ndbapi/testBackup.cpp:
  fix return codes
ndb/test/run-test/daily-basic-tests.txt:
  Add failure test cases to autotest
ndb/test/src/NdbBackup.cpp:
  fix error codes
  introduce checking of backup resources after each test
parent 30bd6f51
......@@ -75,7 +75,7 @@ class DefineBackupRef {
friend bool printDEFINE_BACKUP_REF(FILE *, const Uint32 *, Uint32, Uint16);
public:
STATIC_CONST( SignalLength = 3 );
STATIC_CONST( SignalLength = 4 );
enum ErrorCode {
Undefined = 1340,
......@@ -92,6 +92,7 @@ private:
Uint32 backupId;
Uint32 backupPtr;
Uint32 errorCode;
Uint32 nodeId;
};
class DefineBackupConf {
......@@ -158,7 +159,7 @@ class StartBackupRef {
friend bool printSTART_BACKUP_REF(FILE *, const Uint32 *, Uint32, Uint16);
public:
STATIC_CONST( SignalLength = 4 );
STATIC_CONST( SignalLength = 5 );
enum ErrorCode {
FailedToAllocateTriggerRecord = 1
......@@ -168,6 +169,7 @@ private:
Uint32 backupPtr;
Uint32 signalNo;
Uint32 errorCode;
Uint32 nodeId;
};
class StartBackupConf {
......@@ -232,9 +234,8 @@ public:
private:
Uint32 backupId;
Uint32 backupPtr;
Uint32 tableId;
Uint32 fragmentNo;
Uint32 errorCode;
Uint32 nodeId;
};
class BackupFragmentConf {
......@@ -296,12 +297,13 @@ class StopBackupRef {
friend bool printSTOP_BACKUP_REF(FILE *, const Uint32 *, Uint32, Uint16);
public:
STATIC_CONST( SignalLength = 3 );
STATIC_CONST( SignalLength = 4 );
private:
Uint32 backupId;
Uint32 backupPtr;
Uint32 errorCode;
Uint32 nodeId;
};
class StopBackupConf {
......
......@@ -240,6 +240,9 @@ public:
FileOrScanError = 1325, // slave -> coordinator
BackupFailureDueToNodeFail = 1326, // slave -> slave
OkToClean = 1327 // master -> slave
,AbortScan = 1328
,IncompatibleVersions = 1329
};
private:
Uint32 requestType;
......
......@@ -90,10 +90,8 @@ printBACKUP_FRAGMENT_REQ(FILE * out, const Uint32 * data, Uint32 l, Uint16 bno){
bool
printBACKUP_FRAGMENT_REF(FILE * out, const Uint32 * data, Uint32 l, Uint16 bno){
BackupFragmentRef* sig = (BackupFragmentRef*)data;
fprintf(out, " backupPtr: %d backupId: %d\n",
sig->backupPtr, sig->backupId);
fprintf(out, " tableId: %d fragmentNo: %d errorCode: %d\n",
sig->tableId, sig->fragmentNo, sig->errorCode);
fprintf(out, " backupPtr: %d backupId: %d nodeId: %d errorCode: %d\n",
sig->backupPtr, sig->backupId, sig->nodeId, sig->errorCode);
return true;
}
......
This diff is collapsed.
......@@ -232,6 +232,7 @@ public:
*/
bool newScan();
bool scanConf(Uint32 noOfOps, Uint32 opLen);
bool closeScan();
/**
* Per record
......@@ -330,7 +331,7 @@ public:
Uint8 fileOpened;
Uint8 fileRunning;
Uint8 fileDone;
Uint8 fileClosing;
Uint8 scanRunning;
};
typedef Ptr<BackupFile> BackupFilePtr;
......@@ -403,13 +404,11 @@ public:
ArrayPool<TriggerRecord> & trp)
: slaveState(b, validSlaveTransitions, validSlaveTransitionsCount,1)
, tables(tp), triggers(trp), files(bp), pages(pp)
, masterData(b, validMasterTransitions, validMasterTransitionsCount)
, backup(b)
{
closingFiles = false;
okToCleanMaster = true;
}
, masterData(b), backup(b)
{
}
Uint32 m_gsn;
CompoundState slaveState;
Uint32 clientRef;
......@@ -420,9 +419,6 @@ public:
Uint32 errorCode;
NdbNodeBitmask nodes;
bool okToCleanMaster;
bool closingFiles;
Uint64 noOfBytes;
Uint64 noOfRecords;
Uint64 noOfLogBytes;
......@@ -444,15 +440,13 @@ public:
SimpleProperties props;// Used for (un)packing backup request
struct MasterData {
MasterData(Backup & b, const State valid[], Uint32 count)
: state(b, valid, count, 0)
{
}
MasterData(Backup & b)
{
}
MutexHandle2<BACKUP_DEFINE_MUTEX> m_defineBackupMutex;
MutexHandle2<DICT_COMMIT_TABLE_MUTEX> m_dictCommitTableMutex;
Uint32 gsn;
CompoundState state;
SignalCounter sendCounter;
Uint32 errorCode;
struct {
......@@ -557,7 +551,8 @@ public:
void stopBackupReply(Signal* signal, BackupRecordPtr ptr, Uint32 nodeId);
void defineBackupRef(Signal*, BackupRecordPtr, Uint32 errCode = 0);
void backupFragmentRef(Signal * signal, BackupFilePtr filePtr);
void nextFragment(Signal*, BackupRecordPtr);
void sendCreateTrig(Signal*, BackupRecordPtr ptr, TablePtr tabPtr);
......@@ -578,14 +573,14 @@ public:
void sendAbortBackupOrd(Signal* signal, BackupRecordPtr ptr, Uint32 errCode);
void sendAbortBackupOrdSlave(Signal* signal, BackupRecordPtr ptr,
Uint32 errCode);
void masterAbort(Signal*, BackupRecordPtr ptr, bool controlledAbort);
void masterAbort(Signal*, BackupRecordPtr ptr);
void masterSendAbortBackup(Signal*, BackupRecordPtr ptr);
void slaveAbort(Signal*, BackupRecordPtr ptr);
void abortFile(Signal* signal, BackupRecordPtr ptr, BackupFilePtr filePtr);
void abortFileHook(Signal* signal, BackupFilePtr filePtr, bool scanDone);
bool verifyNodesAlive(const NdbNodeBitmask& aNodeBitMask);
bool verifyNodesAlive(BackupRecordPtr, const NdbNodeBitmask& aNodeBitMask);
bool checkAbort(BackupRecordPtr ptr);
void checkNodeFail(Signal* signal,
BackupRecordPtr ptr,
......@@ -603,9 +598,8 @@ public:
void sendBackupRef(BlockReference ref, Signal *signal,
Uint32 senderData, Uint32 errorCode);
void dumpUsedResources();
void cleanupMasterResources(BackupRecordPtr ptr);
void cleanupSlaveResources(BackupRecordPtr ptr);
void cleanupFinalResources(BackupRecordPtr ptr);
void cleanup(Signal*, BackupRecordPtr ptr);
void abort_scan(Signal*, BackupRecordPtr ptr);
void removeBackup(Signal*, BackupRecordPtr ptr);
void sendSTTORRY(Signal*);
......
......@@ -341,3 +341,28 @@ start backup
(ERROR_INSERTED(10022))) {
if (ERROR_INSERTED(10029)) {
if(trigPtr.p->operation->noOfBytes > 123 && ERROR_INSERTED(10030)) {
----- XXX ---
DEFINE_BACKUP_REF ->
ABORT_BACKUP_ORD(no reply) when all DEFINE_BACKUP replies has arrived
START_BACKUP_REF
ABORT_BACKUP_ORD(no reply) when all START_BACKUP_ replies has arrived
BACKUP_FRAGMENT_REF
ABORT_BACKUP_ORD(reply) directly to all nodes running BACKUP_FRAGMENT
When all nodes has replied BACKUP_FRAGMENT
ABORT_BACKUP_ORD(no reply)
STOP_BACKUP_REF
ABORT_BACKUP_ORD(no reply) when all STOP_BACKUP_ replies has arrived
NF_COMPLETE_REP
slave dies
master sends OUTSTANDING_REF to self
slave does nothing
master dies
slave elects self as master and sets only itself as participant
......@@ -175,7 +175,7 @@ Backup::Backup(const Configuration & conf) :
addRecSignal(GSN_START_BACKUP_CONF, &Backup::execSTART_BACKUP_CONF);
addRecSignal(GSN_BACKUP_FRAGMENT_REQ, &Backup::execBACKUP_FRAGMENT_REQ);
//addRecSignal(GSN_BACKUP_FRAGMENT_REF, &Backup::execBACKUP_FRAGMENT_REF);
addRecSignal(GSN_BACKUP_FRAGMENT_REF, &Backup::execBACKUP_FRAGMENT_REF);
addRecSignal(GSN_BACKUP_FRAGMENT_CONF, &Backup::execBACKUP_FRAGMENT_CONF);
addRecSignal(GSN_STOP_BACKUP_REQ, &Backup::execSTOP_BACKUP_REQ);
......
......@@ -126,6 +126,7 @@ Cmvmi::Cmvmi(const Configuration & conf) :
}
setNodeInfo(getOwnNodeId()).m_connected = true;
setNodeInfo(getOwnNodeId()).m_version = ndbGetOwnVersion();
}
Cmvmi::~Cmvmi()
......
......@@ -1565,9 +1565,9 @@ ndb_mgm_start_backup(NdbMgmHandle handle, int wait_completed,
{ // start backup can take some time, set timeout high
Uint64 old_timeout= handle->read_timeout;
if (wait_completed == 2)
handle->read_timeout= 30*60*1000; // 30 minutes
handle->read_timeout= 48*60*60*1000; // 48 hours
else if (wait_completed == 1)
handle->read_timeout= 5*60*1000; // 5 minutes
handle->read_timeout= 10*60*1000; // 10 minutes
reply = ndb_mgm_call(handle, start_backup_reply, "start backup", &args);
handle->read_timeout= old_timeout;
}
......
......@@ -791,7 +791,7 @@ MgmtSrvr::restartNode(int processId, bool nostart,
result = sendSignal(processId, NO_WAIT, signal, true);
}
if (result == -1) {
if (result == -1 && theWaitState != WAIT_NODEFAILURE) {
m_stopRec.inUse = false;
return SEND_OR_RECEIVE_FAILED;
}
......@@ -1920,6 +1920,7 @@ MgmtSrvr::handleReceivedSignal(NdbApiSignal* signal)
#ifdef VM_TRACE
ndbout_c("I'm not master resending to %d", aNodeId);
#endif
theWaitNode= aNodeId;
NdbApiSignal aSignal(_ownReference);
BackupReq* req = CAST_PTR(BackupReq, aSignal.getDataPtrSend());
aSignal.set(TestOrd::TraceAPI, BACKUP, GSN_BACKUP_REQ,
......@@ -1947,6 +1948,7 @@ MgmtSrvr::handleReceivedSignal(NdbApiSignal* signal)
event.Event = BackupEvent::BackupAborted;
event.Aborted.Reason = rep->reason;
event.Aborted.BackupId = rep->backupId;
event.Aborted.ErrorCode = rep->reason;
backupCallback(event);
}
break;
......@@ -2076,6 +2078,13 @@ MgmtSrvr::handleStatus(NodeId nodeId, bool alive, bool nfComplete)
handleStopReply(nodeId, 0);
DBUG_VOID_RETURN;
}
if(theWaitNode == nodeId &&
theWaitState != NO_WAIT && theWaitState != WAIT_STOP)
{
theWaitState = WAIT_NODEFAILURE;
NdbCondition_Signal(theMgmtWaitForResponseCondPtr);
}
}
eventReport(_ownNodeId, theData);
......@@ -2427,7 +2436,7 @@ MgmtSrvr::startBackup(Uint32& backupId, int waitCompleted)
int result;
if (waitCompleted == 2) {
result = sendRecSignal(nodeId, WAIT_BACKUP_COMPLETED,
signal, true, 30*60*1000 /*30 secs*/);
signal, true, 48*60*60*1000 /* 48 hours */);
}
else if (waitCompleted == 1) {
result = sendRecSignal(nodeId, WAIT_BACKUP_STARTED,
......@@ -2456,22 +2465,6 @@ MgmtSrvr::startBackup(Uint32& backupId, int waitCompleted)
return -1;
break;
}
} else {
switch(m_lastBackupEvent.Event){
case BackupEvent::BackupCompleted:
backupId = m_lastBackupEvent.Completed.BackupId;
break;
case BackupEvent::BackupStarted:
backupId = m_lastBackupEvent.Started.BackupId;
break;
case BackupEvent::BackupFailedToStart:
return m_lastBackupEvent.FailedToStart.ErrorCode;
case BackupEvent::BackupAborted:
return m_lastBackupEvent.Aborted.ErrorCode;
default:
return -1;
break;
}
}
return 0;
......
......@@ -611,7 +611,8 @@ private:
WAIT_STOP,
WAIT_BACKUP_STARTED,
WAIT_BACKUP_COMPLETED,
WAIT_VERSION
WAIT_VERSION,
WAIT_NODEFAILURE
};
/**
......@@ -695,6 +696,7 @@ private:
NdbApiSignal* theSignalIdleList;
// List of unused signals
Uint32 theWaitNode;
WaitSignalType theWaitState;
// State denoting a set of signals we accept to recieve.
......
......@@ -108,6 +108,7 @@ MgmtSrvr::sendRecSignal(Uint16 aNodeId,
return -1;
}
theWaitState = aWaitState;
theWaitNode = aNodeId;
return receiveOptimisedResponse(waitTime);
}
......@@ -119,11 +120,12 @@ MgmtSrvr::receiveOptimisedResponse(int waitTime)
theFacade->checkForceSend(_blockNumber);
NDB_TICKS maxTime = NdbTick_CurrentMillisecond() + waitTime;
while (theWaitState != NO_WAIT && waitTime > 0) {
while (theWaitState != NO_WAIT && theWaitState != WAIT_NODEFAILURE
&& waitTime > 0) {
NdbCondition_WaitTimeout(theMgmtWaitForResponseCondPtr,
theFacade->theMutexPtr,
waitTime);
if(theWaitState == NO_WAIT)
if(theWaitState == NO_WAIT || theWaitState == WAIT_NODEFAILURE)
break;
waitTime = (maxTime - NdbTick_CurrentMillisecond());
}//while
......
......@@ -345,7 +345,7 @@ ErrorBundle ErrorCodes[] = {
{ 1325, IE, "File or scan error" },
{ 1326, IE, "Backup abortet due to node failure" },
{ 1327, IE, "1327" },
{ 1340, IE, "Backup undefined error" },
{ 1342, AE, "Backup failed to allocate buffers (check configuration)" },
{ 1343, AE, "Backup failed to setup fs buffers (check configuration)" },
......@@ -355,7 +355,8 @@ ErrorBundle ErrorCodes[] = {
{ 1347, AE, "Backup failed to allocate table memory (check configuration)" },
{ 1348, AE, "Backup failed to allocate file record (check configuration)" },
{ 1349, AE, "Backup failed to allocate attribute record (check configuration)" },
{ 1329, AE, "Backup during software upgrade not supported" },
/**
* Still uncategorized
*/
......
......@@ -74,20 +74,20 @@ int runAbort(NDBT_Context* ctx, NDBT_Step* step){
if (testMaster) {
if (testSlave) {
if (backup.NFMasterAsSlave(restarter) == -1){
if (backup.NFMasterAsSlave(restarter) != NDBT_OK){
return NDBT_FAILED;
}
} else {
if (backup.NFMaster(restarter) == -1){
if (backup.NFMaster(restarter) != NDBT_OK){
return NDBT_FAILED;
}
}
} else {
if (backup.NFSlave(restarter) == -1){
if (backup.NFSlave(restarter) != NDBT_OK){
return NDBT_FAILED;
}
}
return NDBT_OK;
}
......@@ -108,16 +108,16 @@ int runFail(NDBT_Context* ctx, NDBT_Step* step){
if (testMaster) {
if (testSlave) {
if (backup.FailMasterAsSlave(restarter) == -1){
if (backup.FailMasterAsSlave(restarter) != NDBT_OK){
return NDBT_FAILED;
}
} else {
if (backup.FailMaster(restarter) == -1){
if (backup.FailMaster(restarter) != NDBT_OK){
return NDBT_FAILED;
}
}
} else {
if (backup.FailSlave(restarter) == -1){
if (backup.FailSlave(restarter) != NDBT_OK){
return NDBT_FAILED;
}
}
......
......@@ -2,6 +2,30 @@ max-time: 3600
cmd: atrt-mysql-test-run
args: --force
max-time: 600
cmd: atrt-testBackup
args: -n NFMaster T1
max-time: 600
cmd: atrt-testBackup
args: -n NFMasterAsSlave T1
max-time: 600
cmd: atrt-testBackup
args: -n NFSlave T1
max-time: 600
cmd: atrt-testBackup
args: -n FailMaster T1
max-time: 600
cmd: atrt-testBackup
args: -n FailMasterAsSlave T1
max-time: 600
cmd: atrt-testBackup
args: -n FailSlave T1
max-time: 600
cmd: atrt-testBackup
args: -n BackupOne T1 T6 T3 I3
......
......@@ -245,6 +245,10 @@ NdbBackup::NFSlave(NdbRestarter& _restarter){
int
NdbBackup::NF(NdbRestarter& _restarter, int *NFDuringBackup_codes, const int sz, bool onMaster){
{
int nNodes = _restarter.getNumDbNodes();
if(nNodes == 1)
return NDBT_OK;
int nodeId = _restarter.getMasterNodeId();
CHECK(_restarter.restartOneDbNode(nodeId, false, true, true) == 0,
......@@ -255,15 +259,11 @@ NdbBackup::NF(NdbRestarter& _restarter, int *NFDuringBackup_codes, const int sz,
CHECK(_restarter.startNodes(&nodeId, 1) == 0,
"failed to start node");
NdbSleep_SecSleep(10);
}
CHECK(_restarter.waitClusterStarted() == 0,
"waitClusterStarted failed");
int nNodes = _restarter.getNumDbNodes();
myRandom48Init(NdbTick_CurrentMillisecond());
for(int i = 0; i<sz; i++){
......@@ -296,6 +296,7 @@ NdbBackup::NF(NdbRestarter& _restarter, int *NFDuringBackup_codes, const int sz,
"failed to set error insert");
g_info << "error inserted" << endl;
NdbSleep_SecSleep(1);
g_info << "starting backup" << endl;
int r = start(backupId);
......@@ -304,6 +305,7 @@ NdbBackup::NF(NdbRestarter& _restarter, int *NFDuringBackup_codes, const int sz,
if (r == 0) {
g_err << "Backup should have failed on error_insertion " << error << endl
<< "Master = " << masterNodeId << "Node = " << nodeId << endl;
return NDBT_FAILED;
}
CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0,
......@@ -316,8 +318,6 @@ NdbBackup::NF(NdbRestarter& _restarter, int *NFDuringBackup_codes, const int sz,
return NDBT_FAILED;
}
NdbSleep_SecSleep(1);
g_info << "starting new backup" << endl;
CHECK(start(backupId) == 0,
"failed to start backup");
......@@ -331,8 +331,14 @@ NdbBackup::NF(NdbRestarter& _restarter, int *NFDuringBackup_codes, const int sz,
"waitClusterStarted failed");
g_info << "node started" << endl;
int val2[] = { 24, 2424 };
CHECK(_restarter.dumpStateAllNodes(val2, 2) == 0,
"failed to check backup resources RestartOnErrorInsert");
CHECK(_restarter.insertErrorInNode(nodeId, 10099) == 0,
"failed to set error insert");
NdbSleep_SecSleep(1);
}
return NDBT_OK;
......@@ -340,15 +346,8 @@ NdbBackup::NF(NdbRestarter& _restarter, int *NFDuringBackup_codes, const int sz,
int
FailS_codes[] = {
10023,
10024,
10025,
10026,
10027,
10028,
10029,
10030,
10031
10033
};
int
......@@ -359,9 +358,8 @@ FailM_codes[] = {
10026,
10027,
10028,
10029,
10030,
10031
10031,
10033
};
int
......@@ -426,13 +424,21 @@ NdbBackup::Fail(NdbRestarter& _restarter, int *Fail_codes, const int sz, bool on
if (r == 0) {
g_err << "Backup should have failed on error_insertion " << error << endl
<< "Master = " << masterNodeId << "Node = " << nodeId << endl;
return NDBT_FAILED;
}
CHECK(_restarter.waitClusterStarted() == 0,
"waitClusterStarted failed");
CHECK(_restarter.insertErrorInNode(nodeId, 10099) == 0,
"failed to set error insert");
NdbSleep_SecSleep(5);
int val2[] = { 24, 2424 };
CHECK(_restarter.dumpStateAllNodes(val2, 2) == 0,
"failed to check backup resources RestartOnErrorInsert");
}
return NDBT_OK;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment