ndb - Add possibility to limit disk write speed in backup(lcp)

      3 new parameters:
      DiskSyncSize - Outstanding disk writes before sync (default 4M)
      DiskCheckpointSpeed - Write speed of LCP in bytes/sec (default 10M)
      DiskCheckpointSpeedInRestart - As above but during restart (default 100M)

      Deprecated old NoOfDiskPagesToDisk*

    - Change NoOfFragmentLogFiles default to 16 (1Gb)
parent 7c29f7b6
......@@ -32,7 +32,8 @@ private:
BUFFER_FULL_SCAN = 2,
BUFFER_FULL_FRAG_COMPLETE = 3,
BUFFER_FULL_META = 4,
BACKUP_FRAGMENT_INFO = 5
BACKUP_FRAGMENT_INFO = 5,
RESET_DISK_SPEED_COUNTER = 6
};
};
......
......@@ -92,6 +92,10 @@
#define CFG_DB_DISK_PAGE_BUFFER_MEMORY 160
#define CFG_DB_STRING_MEMORY 161
#define CFG_DB_DISK_SYNCH_SIZE 163
#define CFG_DB_CHECKPOINT_SPEED 164
#define CFG_DB_CHECKPOINT_SPEED_SR 165
#define CFG_DB_SGA 198 /* super pool mem */
#define CFG_DB_DATA_MEM_2 199 /* used in special build in 5.1 */
......
......@@ -84,6 +84,16 @@ Backup::execSTTOR(Signal* signal)
const Uint32 startphase = signal->theData[1];
const Uint32 typeOfStart = signal->theData[7];
if (startphase == 1)
{
m_curr_disk_write_speed = c_defaults.m_disk_write_speed_sr;
m_overflow_disk_write = 0;
m_reset_disk_speed_time = NdbTick_CurrentMillisecond();
m_reset_delay_used = Backup::DISK_SPEED_CHECK_DELAY;
signal->theData[0] = BackupContinueB::RESET_DISK_SPEED_COUNTER;
sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal,
Backup::DISK_SPEED_CHECK_DELAY, 1);
}
if (startphase == 3) {
jam();
g_TypeOfStart = typeOfStart;
......@@ -92,6 +102,11 @@ Backup::execSTTOR(Signal* signal)
return;
}//if
if (startphase == 7)
{
m_curr_disk_write_speed = c_defaults.m_disk_write_speed;
}
if(startphase == 7 && g_TypeOfStart == NodeState::ST_INITIAL_START &&
c_masterNodeId == getOwnNodeId()){
jam();
......@@ -170,6 +185,42 @@ Backup::execCONTINUEB(Signal* signal)
const Uint32 Tdata2 = signal->theData[2];
switch(Tdata0) {
case BackupContinueB::RESET_DISK_SPEED_COUNTER:
{
/*
Adjust for upto 10 millisecond delay of this signal. Longer
delays will not be handled, in this case the system is most
likely under too high load and it won't matter very much that
we decrease the speed of checkpoints.
We use a technique where we allow an overflow write in one
period. This overflow will be removed from the next period
such that the load will at average be as specified.
*/
int delay_time = m_reset_delay_used;
NDB_TICKS curr_time = NdbTick_CurrentMillisecond();
int sig_delay = curr_time - m_reset_disk_speed_time;
m_words_written_this_period = m_overflow_disk_write;
m_overflow_disk_write = 0;
m_reset_disk_speed_time = curr_time;
if (sig_delay > delay_time + 10)
delay_time = Backup::DISK_SPEED_CHECK_DELAY - 10;
else if (sig_delay < delay_time - 10)
delay_time = Backup::DISK_SPEED_CHECK_DELAY + 10;
else
delay_time = Backup::DISK_SPEED_CHECK_DELAY - (sig_delay - delay_time);
m_reset_delay_used= delay_time;
signal->theData[0] = BackupContinueB::RESET_DISK_SPEED_COUNTER;
sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal, delay_time, 1);
#if 0
ndbout << "Signal delay was = " << sig_delay;
ndbout << " Current time = " << curr_time << endl;
ndbout << " Delay time will be = " << delay_time << endl << endl;
#endif
break;
}
case BackupContinueB::BACKUP_FRAGMENT_INFO:
{
const Uint32 ptr_I = Tdata1;
......@@ -202,8 +253,8 @@ Backup::execCONTINUEB(Signal* signal)
fragInfo->FragmentNo = htonl(fragPtr_I);
fragInfo->NoOfRecordsLow = htonl(fragPtr.p->noOfRecords & 0xFFFFFFFF);
fragInfo->NoOfRecordsHigh = htonl(fragPtr.p->noOfRecords >> 32);
fragInfo->FilePosLow = htonl(0 & 0xFFFFFFFF);
fragInfo->FilePosHigh = htonl(0 >> 32);
fragInfo->FilePosLow = htonl(0);
fragInfo->FilePosHigh = htonl(0);
filePtr.p->operation.dataBuffer.updateWritePtr(sz);
......@@ -938,7 +989,7 @@ Backup::execBACKUP_REQ(Signal* signal)
return;
}//if
if (m_diskless)
if (c_defaults.m_diskless)
{
sendBackupRef(senderRef, flags, signal, senderData,
BackupRef::CannotBackupDiskless);
......@@ -2610,9 +2661,10 @@ Backup::openFiles(Signal* signal, BackupRecordPtr ptr)
FsOpenReq::OM_WRITEONLY |
FsOpenReq::OM_TRUNCATE |
FsOpenReq::OM_CREATE |
FsOpenReq::OM_APPEND;
FsOpenReq::OM_APPEND |
FsOpenReq::OM_AUTOSYNC;
FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
req->auto_sync_size = c_defaults.m_disk_synch_size;
/**
* Ctl file
*/
......@@ -3881,6 +3933,69 @@ Backup::execFSAPPENDCONF(Signal* signal)
checkFile(signal, filePtr);
}
/*
  This routine handles two problems with writing to disk during local
  checkpoints and backups. The first problem is that we need to limit
  the writing to ensure that we don't use too much CPU and disk resources
  for backups and checkpoints. The perfect solution to this is to use
  a dynamic algorithm that adapts to the environment. Until we have
  implemented this we can satisfy ourselves with an algorithm that
  uses a configurable limit.
  The second problem is that in Linux we can get severe problems if we
  write very much to the disk without synching. In the worst case we
  can have Gigabytes of data in the Linux page cache before we reach
  the limit of how much we can write. If this happens the performance
  will drop significantly when we reach this limit since the Linux flush
  daemon will spend a few minutes on writing out the page cache to disk.
  To avoid this we ensure that a file never has more than a certain
  amount of data outstanding before synch. This amount is also
  configurable.
*/
/*
  Decide whether a disk write of sz words may be issued now, given the
  per-period write-speed quota in m_curr_disk_write_speed. Also invoked
  at end-of-file, where remaining data must be flushed before the file
  can be closed.
*/
bool
Backup::ready_to_write(bool ready, Uint32 sz, bool eof, BackupFile *fileP)
{
#if 0
  ndbout << "ready_to_write: ready = " << ready << " eof = " << eof;
  ndbout << " sz = " << sz << endl;
  ndbout << "words this period = " << m_words_written_this_period;
  ndbout << endl << "overflow disk write = " << m_overflow_disk_write;
  ndbout << endl << "Current Millisecond is = ";
  ndbout << NdbTick_CurrentMillisecond() << endl;
#endif
  const bool have_data = (ready || eof);
  const bool quota_left =
    (m_words_written_this_period <= m_curr_disk_write_speed);
  if (!have_data || !quota_left)
  {
    /* Nothing to write yet, or this period's quota is exhausted. */
#if 0
    ndbout << "Will not write now" << endl << endl;
#endif
    return false;
  }
  /*
    Either a buffer is ready for writing, or end of file was reached and
    the remainder must go out before close. The quota check above only
    looks at the current period; writes issued earlier than that should
    already have completed. Account for this write, and if it pushes us
    past the quota, carry the excess into the next period so the average
    rate stays at the configured limit.
  */
  m_words_written_this_period += sz;
  const int excess =
    (int)(m_words_written_this_period - m_curr_disk_write_speed);
  if (excess > 0)
    m_overflow_disk_write = excess;
#if 0
  ndbout << "Will write with " << endl;
  ndbout << endl;
#endif
  return true;
}
void
Backup::checkFile(Signal* signal, BackupFilePtr filePtr)
{
......@@ -3890,35 +4005,23 @@ Backup::checkFile(Signal* signal, BackupFilePtr filePtr)
#endif
OperationRecord & op = filePtr.p->operation;
Uint32 * tmp, sz; bool eof;
if(op.dataBuffer.getReadPtr(&tmp, &sz, &eof))
Uint32 *tmp = NULL;
Uint32 sz = 0;
bool eof = FALSE;
bool ready = op.dataBuffer.getReadPtr(&tmp, &sz, &eof);
#if 0
ndbout << "Ptr to data = " << hex << tmp << endl;
#endif
if (!ready_to_write(ready, sz, eof, filePtr.p))
{
jam();
jam();
FsAppendReq * req = (FsAppendReq *)signal->getDataPtrSend();
req->filePointer = filePtr.p->filePointer;
req->userPointer = filePtr.i;
req->userReference = reference();
req->varIndex = 0;
req->offset = tmp - c_startOfPages;
req->size = sz;
sendSignal(NDBFS_REF, GSN_FSAPPENDREQ, signal,
FsAppendReq::SignalLength, JBA);
return;
}
if(!eof) {
jam();
signal->theData[0] = BackupContinueB::BUFFER_UNDERFLOW;
signal->theData[1] = filePtr.i;
sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal, 50, 2);
sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal, 20, 2);
return;
}//if
if(sz > 0) {
}
else if (sz > 0)
{
jam();
FsAppendReq * req = (FsAppendReq *)signal->getDataPtrSend();
req->filePointer = filePtr.p->filePointer;
......@@ -3926,13 +4029,14 @@ Backup::checkFile(Signal* signal, BackupFilePtr filePtr)
req->userReference = reference();
req->varIndex = 0;
req->offset = tmp - c_startOfPages;
req->size = sz; // Round up
req->size = sz;
req->synch_flag = 0;
sendSignal(NDBFS_REF, GSN_FSAPPENDREQ, signal,
FsAppendReq::SignalLength, JBA);
return;
}//if
}
#ifdef DEBUG_ABORT
Uint32 running= filePtr.p->fileRunning;
Uint32 closing= filePtr.p->fileClosing;
......@@ -4214,16 +4318,15 @@ Backup::closeFiles(Signal* sig, BackupRecordPtr ptr)
continue;
}//if
filePtr.p->operation.dataBuffer.eof();
if(filePtr.p->fileRunning == 1){
jam();
#ifdef DEBUG_ABORT
ndbout_c("Close files fileRunning == 1, filePtr.i=%u", filePtr.i);
#endif
filePtr.p->operation.dataBuffer.eof();
} else {
jam();
filePtr.p->fileClosing = 1;
filePtr.p->operation.dataBuffer.eof();
checkFile(sig, filePtr); // make sure we write everything before closing
FsCloseReq * req = (FsCloseReq *)sig->getDataPtrSend();
......@@ -4712,8 +4815,10 @@ Backup::lcp_open_file(Signal* signal, BackupRecordPtr ptr)
FsOpenReq::OM_WRITEONLY |
FsOpenReq::OM_TRUNCATE |
FsOpenReq::OM_CREATE |
FsOpenReq::OM_APPEND;
FsOpenReq::OM_APPEND |
FsOpenReq::OM_AUTOSYNC;
FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
req->auto_sync_size = c_defaults.m_disk_synch_size;
TablePtr tabPtr;
FragmentPtr fragPtr;
......
......@@ -33,6 +33,7 @@
#include <blocks/mutexes.hpp>
#include <NdbTCP.h>
#include <NdbTick.h>
#include <Array.hpp>
/**
......@@ -522,6 +523,11 @@ public:
Uint32 m_minWriteSize;
Uint32 m_maxWriteSize;
Uint32 m_lcp_buffer_size;
Uint32 m_disk_write_speed_sr;
Uint32 m_disk_write_speed;
Uint32 m_disk_synch_size;
Uint32 m_diskless;
};
/**
......@@ -533,8 +539,17 @@ public:
NdbNodeBitmask c_aliveNodes;
DLList<BackupRecord> c_backups;
Config c_defaults;
Uint32 m_diskless;
/*
Variables that control checkpoint to disk speed
*/
Uint32 m_curr_disk_write_speed;
Uint32 m_words_written_this_period;
Uint32 m_overflow_disk_write;
Uint32 m_reset_delay_used;
NDB_TICKS m_reset_disk_speed_time;
static const int DISK_SPEED_CHECK_DELAY = 100;
STATIC_CONST(NO_OF_PAGES_META_FILE = MAX_WORDS_META_FILE/BACKUP_WORDS_PER_PAGE);
/**
......@@ -631,6 +646,8 @@ public:
void lcp_open_file_done(Signal*, BackupRecordPtr);
void lcp_close_file_conf(Signal* signal, BackupRecordPtr);
void lcp_send_end_lcp_conf(Signal* signal, BackupRecordPtr);
bool ready_to_write(bool ready, Uint32 sz, bool eof, BackupFile *fileP);
};
inline
......
......@@ -146,8 +146,28 @@ Backup::execREAD_CONFIG_REQ(Signal* signal)
m_ctx.m_config.getOwnConfigIterator();
ndbrequire(p != 0);
c_defaults.m_disk_write_speed = 10 * (1024 * 1024);
c_defaults.m_disk_write_speed_sr = 100 * (1024 * 1024);
c_defaults.m_disk_synch_size = 4 * (1024 * 1024);
Uint32 noBackups = 0, noTables = 0, noAttribs = 0, noFrags = 0;
ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS, &m_diskless));
ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS,
&c_defaults.m_diskless));
ndb_mgm_get_int_parameter(p, CFG_DB_CHECKPOINT_SPEED_SR,
&c_defaults.m_disk_write_speed_sr);
ndb_mgm_get_int_parameter(p, CFG_DB_CHECKPOINT_SPEED,
&c_defaults.m_disk_write_speed);
ndb_mgm_get_int_parameter(p, CFG_DB_DISK_SYNCH_SIZE,
&c_defaults.m_disk_synch_size);
/*
We adjust the disk speed parameters from bytes per second to rather be
words per 100 milliseconds. We convert disk synch size from bytes per
second to words per second.
*/
c_defaults.m_disk_write_speed /= (4 * 10);
c_defaults.m_disk_write_speed_sr /= (4 * 10);
ndb_mgm_get_int_parameter(p, CFG_DB_PARALLEL_BACKUPS, &noBackups);
// ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_NO_TABLES, &noTables));
ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DICT_TABLE, &noTables));
......
......@@ -877,7 +877,7 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
ConfigInfo::CI_USED,
false,
ConfigInfo::CI_INT,
"8",
"16",
"3",
STR_VALUE(MAX_INT_RNIL) },
......@@ -952,8 +952,8 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
CFG_DB_LCP_DISC_PAGES_TUP_SR,
"NoOfDiskPagesToDiskDuringRestartTUP",
DB_TOKEN,
"?",
ConfigInfo::CI_USED,
"DiskCheckpointSpeedSr",
ConfigInfo::CI_DEPRICATED,
true,
ConfigInfo::CI_INT,
"40",
......@@ -964,8 +964,8 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
CFG_DB_LCP_DISC_PAGES_TUP,
"NoOfDiskPagesToDiskAfterRestartTUP",
DB_TOKEN,
"?",
ConfigInfo::CI_USED,
"DiskCheckpointSpeed",
ConfigInfo::CI_DEPRICATED,
true,
ConfigInfo::CI_INT,
"40",
......@@ -976,8 +976,8 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
CFG_DB_LCP_DISC_PAGES_ACC_SR,
"NoOfDiskPagesToDiskDuringRestartACC",
DB_TOKEN,
"?",
ConfigInfo::CI_USED,
"DiskCheckpointSpeedSr",
ConfigInfo::CI_DEPRICATED,
true,
ConfigInfo::CI_INT,
"20",
......@@ -988,8 +988,8 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
CFG_DB_LCP_DISC_PAGES_ACC,
"NoOfDiskPagesToDiskAfterRestartACC",
DB_TOKEN,
"?",
ConfigInfo::CI_USED,
"DiskCheckpointSpeed",
ConfigInfo::CI_DEPRICATED,
true,
ConfigInfo::CI_INT,
"20",
......@@ -1191,6 +1191,42 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
UNDEFINED,
0, 0 },
{
CFG_DB_DISK_SYNCH_SIZE,
"DiskSyncSize",
DB_TOKEN,
"Data written to a file before a synch is forced",
ConfigInfo::CI_USED,
false,
ConfigInfo::CI_INT,
"4M",
"32k",
STR_VALUE(MAX_INT_RNIL) },
{
CFG_DB_CHECKPOINT_SPEED,
"DiskCheckpointSpeed",
DB_TOKEN,
"Bytes per second allowed to be written by checkpoint",
ConfigInfo::CI_USED,
false,
ConfigInfo::CI_INT,
"10M",
"1M",
STR_VALUE(MAX_INT_RNIL) },
{
CFG_DB_CHECKPOINT_SPEED_SR,
"DiskCheckpointSpeedInRestart",
DB_TOKEN,
"Bytes per second allowed to be written by checkpoint during restart",
ConfigInfo::CI_USED,
false,
ConfigInfo::CI_INT,
"100M",
"1M",
STR_VALUE(MAX_INT_RNIL) },
{
CFG_DB_BACKUP_MEM,
"BackupMemory",
......
......@@ -655,6 +655,18 @@ InitConfigFileParser::store_in_properties(Vector<struct my_option>& options,
m_info->getMax(ctx.m_currentInfo, fname));
return false;
}
ConfigInfo::Status status = m_info->getStatus(ctx.m_currentInfo, fname);
if (status == ConfigInfo::CI_DEPRICATED) {
const char * desc = m_info->getDescription(ctx.m_currentInfo, fname);
if(desc && desc[0]){
ctx.reportWarning("[%s] %s is depricated, use %s instead",
ctx.fname, fname, desc);
} else if (desc == 0){
ctx.reportWarning("[%s] %s is depricated", ctx.fname, fname);
}
}
if (options[i].var_type == GET_INT)
ctx.m_currentSection->put(options[i].name, (Uint32)value_int);
else
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment