Commit 99e84383 authored by unknown

ndb - bug#20895

  Fix occasional LCP hang!!!
  Make sure only to consider alive nodes in startNextChkpt


ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
  Make sure only to consider alive nodes in startNextChkpt
parent 35a202aa
......@@ -9561,73 +9561,84 @@ void Dbdih::startNextChkpt(Signal* signal)
nodePtr.i = replicaPtr.p->procNode;
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
if (replicaPtr.p->lcpOngoingFlag &&
replicaPtr.p->lcpIdStarted < lcpId) {
jam();
//-------------------------------------------------------------------
// We have found a replica on a node that performs local checkpoint
// that is alive and that have not yet been started.
//-------------------------------------------------------------------
if (nodePtr.p->noOfStartedChkpt < 2) {
jam();
/**
* Send LCP_FRAG_ORD to LQH
*/
/**
* Mark the replica so with lcpIdStarted == true
*/
replicaPtr.p->lcpIdStarted = lcpId;
Uint32 i = nodePtr.p->noOfStartedChkpt;
nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
nodePtr.p->noOfStartedChkpt = i + 1;
sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
} else if (nodePtr.p->noOfQueuedChkpt < 2) {
jam();
/**
* Put LCP_FRAG_ORD "in queue"
*/
/**
* Mark the replica so with lcpIdStarted == true
*/
replicaPtr.p->lcpIdStarted = lcpId;
if (c_lcpState.m_participatingLQH.get(nodePtr.i))
{
if (replicaPtr.p->lcpOngoingFlag &&
replicaPtr.p->lcpIdStarted < lcpId)
{
jam();
//-------------------------------------------------------------------
// We have found a replica on a node that performs local checkpoint
// that is alive and that have not yet been started.
//-------------------------------------------------------------------
Uint32 i = nodePtr.p->noOfQueuedChkpt;
nodePtr.p->queuedChkpt[i].tableId = tabPtr.i;
nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
nodePtr.p->noOfQueuedChkpt = i + 1;
} else {
jam();
if (nodePtr.p->noOfStartedChkpt < 2)
{
jam();
/**
* Send LCP_FRAG_ORD to LQH
*/
/**
* Mark the replica so with lcpIdStarted == true
*/
replicaPtr.p->lcpIdStarted = lcpId;
if(save){
Uint32 i = nodePtr.p->noOfStartedChkpt;
nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
nodePtr.p->noOfStartedChkpt = i + 1;
sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
}
else if (nodePtr.p->noOfQueuedChkpt < 2)
{
jam();
/**
* Stop increasing value on first that was "full"
* Put LCP_FRAG_ORD "in queue"
*/
c_lcpState.currentFragment = curr;
save = false;
}
busyNodes.set(nodePtr.i);
if(busyNodes.count() == lcpNodes){
/**
* There were no possibility to start the local checkpoint
* and it was not possible to queue it up. In this case we
* stop the start of local checkpoints until the nodes with a
* backlog have performed more checkpoints. We will return and
* will not continue the process of starting any more checkpoints.
* Mark the replica so with lcpIdStarted == true
*/
return;
replicaPtr.p->lcpIdStarted = lcpId;
Uint32 i = nodePtr.p->noOfQueuedChkpt;
nodePtr.p->queuedChkpt[i].tableId = tabPtr.i;
nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
nodePtr.p->noOfQueuedChkpt = i + 1;
}
else
{
jam();
if(save)
{
/**
* Stop increasing value on first that was "full"
*/
c_lcpState.currentFragment = curr;
save = false;
}
busyNodes.set(nodePtr.i);
if(busyNodes.count() == lcpNodes)
{
/**
* There were no possibility to start the local checkpoint
* and it was not possible to queue it up. In this case we
* stop the start of local checkpoints until the nodes with a
* backlog have performed more checkpoints. We will return and
* will not continue the process of starting any more checkpoints.
*/
return;
}//if
}//if
}//if
}
}//while
}
}//while
}
curr.fragmentId++;
if (curr.fragmentId >= tabPtr.p->totalfragments) {
jam();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment