Commit 5ed830f6 authored by unknown's avatar unknown

tux optim 13 - replace read keys & query th signals tux->tup by methods

parent 6c73e4d9
......@@ -1014,9 +1014,15 @@ public:
void tuxReadAttrs(Uint32 fragPtrI, Uint32 pageId, Uint32 pageOffset, Uint32 tupVersion, Uint32 numAttrs, const Uint32* attrIds, const Uint32** attrData);
/*
* TUX reads primary key for md5 summing and when returning keyinfo.
* TUX reads primary key without headers into an array of words. Used
* for md5 summing and when returning keyinfo.
*/
void tuxReadKeys(); // under construction
void tuxReadKeys(Uint32 fragPtrI, Uint32 pageId, Uint32 pageOffset, Uint32* pkSize, Uint32* pkData);
/*
* TUX checks if tuple is visible to scan.
*/
bool tuxQueryTh(Uint32 fragPtrI, Uint32 tupAddr, Uint32 tupVersion, Uint32 transId1, Uint32 transId2, Uint32 savePointId);
private:
BLOCK_DEFINES(Dbtup);
......
......@@ -152,10 +152,10 @@ Dbtup::tuxReadAttrs(Uint32 fragPtrI, Uint32 pageId, Uint32 pageOffset, Uint32 tu
const Uint32* tupleHeader = &pagePtr.p->pageWord[pageOffset];
for (Uint32 i = 0; i < numAttrs; i++) {
AttributeHeader ah(attrIds[i]);
Uint32 attrId = ah.getAttributeId();
Uint32 index = tabDescriptor + (attrId << ZAD_LOG_SIZE);
Uint32 desc1 = tableDescriptor[index].tabDescr;
Uint32 desc2 = tableDescriptor[index + 1].tabDescr;
const Uint32 attrId = ah.getAttributeId();
const Uint32 index = tabDescriptor + (attrId << ZAD_LOG_SIZE);
const Uint32 desc1 = tableDescriptor[index].tabDescr;
const Uint32 desc2 = tableDescriptor[index + 1].tabDescr;
if (AttributeDescriptor::getNullable(desc1)) {
Uint32 offset = AttributeOffset::getNullFlagOffset(desc2);
ndbrequire(offset < tablePtr.p->tupNullWords);
......@@ -171,9 +171,78 @@ Dbtup::tuxReadAttrs(Uint32 fragPtrI, Uint32 pageId, Uint32 pageOffset, Uint32 tu
}
}
void // under construction
Dbtup::tuxReadKeys()
void
Dbtup::tuxReadKeys(Uint32 fragPtrI, Uint32 pageId, Uint32 pageOffset, Uint32* pkSize, Uint32* pkData)
{
ljamEntry();
FragrecordPtr fragPtr;
fragPtr.i = fragPtrI;
ptrCheckGuard(fragPtr, cnoOfFragrec, fragrecord);
TablerecPtr tablePtr;
tablePtr.i = fragPtr.p->fragTableId;
ptrCheckGuard(tablePtr, cnoOfTablerec, tablerec);
PagePtr pagePtr;
pagePtr.i = pageId;
ptrCheckGuard(pagePtr, cnoOfPage, page);
const Uint32 tabDescriptor = tablePtr.p->tabDescriptor;
const Uint32 numAttrs = tablePtr.p->noOfKeyAttr;
const Uint32* attrIds = &tableDescriptor[tablePtr.p->readKeyArray].tabDescr;
const Uint32* tupleHeader = &pagePtr.p->pageWord[pageOffset];
Uint32 size = 0;
for (Uint32 i = 0; i < numAttrs; i++) {
AttributeHeader ah(attrIds[i]);
const Uint32 attrId = ah.getAttributeId();
const Uint32 index = tabDescriptor + (attrId << ZAD_LOG_SIZE);
const Uint32 desc1 = tableDescriptor[index].tabDescr;
const Uint32 desc2 = tableDescriptor[index + 1].tabDescr;
ndbrequire(! AttributeDescriptor::getNullable(desc1));
const Uint32 attrSize = AttributeDescriptor::getSizeInWords(desc1);
const Uint32* attrData = tupleHeader + AttributeOffset::getOffset(desc2);
for (Uint32 j = 0; j < attrSize; j++) {
pkData[size + j] = attrData[j];
}
size += attrSize;
}
*pkSize = size;
}
bool
Dbtup::tuxQueryTh(Uint32 fragPtrI, Uint32 tupAddr, Uint32 tupVersion, Uint32 transId1, Uint32 transId2, Uint32 savePointId)
{
ljamEntry();
FragrecordPtr fragPtr;
fragPtr.i = fragPtrI;
ptrCheckGuard(fragPtr, cnoOfFragrec, fragrecord);
TablerecPtr tablePtr;
tablePtr.i = fragPtr.p->fragTableId;
ptrCheckGuard(tablePtr, cnoOfTablerec, tablerec);
// get page
PagePtr pagePtr;
Uint32 fragPageId = tupAddr >> MAX_TUPLES_BITS;
Uint32 pageIndex = tupAddr & ((1 << MAX_TUPLES_BITS ) - 1);
// use temp op rec
Operationrec tempOp;
tempOp.fragPageId = fragPageId;
tempOp.pageIndex = pageIndex;
tempOp.transid1 = transId1;
tempOp.transid2 = transId2;
tempOp.savePointId = savePointId;
tempOp.optype = ZREAD;
tempOp.dirtyOp = 1;
if (getPage(pagePtr, &tempOp, fragPtr.p, tablePtr.p)) {
/*
* We use the normal getPage which will return the tuple to be used
* for this transaction and savepoint id. If its tuple version
* equals the requested then we have a visible tuple otherwise not.
*/
ljam();
Uint32 read_tupVersion = pagePtr.p->pageWord[tempOp.pageOffset + 1];
if (read_tupVersion == tupVersion) {
ljam();
return true;
}
}
return false;
}
// deprecated signal interfaces
......
......@@ -542,49 +542,6 @@ private:
void progError(int line, int cause, const char* file);
};
// parameters for methods
/*
* Copy attribute data.
*/
struct CopyPar {
unsigned m_items; // number of attributes
bool m_headers; // copy headers flag (default true)
unsigned m_maxwords; // limit size (default no limit)
// output
unsigned m_numitems; // number of attributes fully copied
unsigned m_numwords; // number of words copied
CopyPar();
};
/*
* Read index key attributes.
*/
struct ReadPar;
friend struct ReadPar;
struct ReadPar {
TreeEnt m_ent; // tuple to read
unsigned m_first; // first index attribute
unsigned m_count; // number of consecutive index attributes
Data m_data; // set pointer if 0 else copy result to it
unsigned m_size; // number of words (set in read keys only)
ReadPar();
};
/*
* Scan bound comparison.
*/
struct BoundPar;
friend struct BoundPar;
struct BoundPar {
ConstData m_data1; // full bound data
ConstData m_data2; // full or prefix data
unsigned m_count1; // number of bounds
unsigned m_len2; // words in data2 buffer
unsigned m_dir; // 0-lower bound 1-upper bound
BoundPar();
};
// methods
/*
......@@ -596,7 +553,7 @@ private:
// utils
void setKeyAttrs(const Frag& frag);
void readKeyAttrs(const Frag& frag, TreeEnt ent, unsigned start, TableData keyData);
void copyAttrs(Data dst, ConstData src, CopyPar& copyPar);
void readTablePk(const Frag& frag, TreeEnt ent, unsigned& pkSize, Data pkData);
void copyAttrs(const Frag& frag, TableData data1, Data data2, unsigned maxlen2 = MaxAttrDataSize);
/*
......@@ -614,8 +571,6 @@ private:
* DbtuxMaint.cpp
*/
void execTUX_MAINT_REQ(Signal* signal);
void tupReadAttrs(Signal* signal, const Frag& frag, ReadPar& readPar);
void tupReadKeys(Signal* signal, const Frag& frag, ReadPar& readPar);
/*
* DbtuxNode.cpp
......@@ -1225,36 +1180,6 @@ Dbtux::NodeHandle::getMinMax(unsigned i)
// parameters for methods
inline
Dbtux::CopyPar::CopyPar() :
m_items(0),
m_headers(true),
m_maxwords(~0), // max unsigned
// output
m_numitems(0),
m_numwords(0)
{
}
inline
Dbtux::ReadPar::ReadPar() :
m_first(0),
m_count(0),
m_data(0),
m_size(0)
{
}
inline
Dbtux::BoundPar::BoundPar() :
m_data1(0),
m_data2(0),
m_count1(0),
m_len2(0),
m_dir(255)
{
}
#ifdef VM_TRACE
inline
Dbtux::PrintPar::PrintPar() :
......
......@@ -246,37 +246,14 @@ Dbtux::readKeyAttrs(const Frag& frag, TreeEnt ent, unsigned start, TableData key
}
void
Dbtux::copyAttrs(Data dst, ConstData src, CopyPar& copyPar)
Dbtux::readTablePk(const Frag& frag, TreeEnt ent, unsigned& pkSize, Data pkData)
{
CopyPar c = copyPar;
c.m_numitems = 0;
c.m_numwords = 0;
while (c.m_numitems < c.m_items) {
jam();
if (c.m_headers) {
unsigned i = 0;
while (i < AttributeHeaderSize) {
if (c.m_numwords >= c.m_maxwords) {
copyPar = c;
return;
}
dst[c.m_numwords++] = src[i++];
}
}
unsigned size = src.ah().getDataSize();
src += AttributeHeaderSize;
unsigned i = 0;
while (i < size) {
if (c.m_numwords >= c.m_maxwords) {
copyPar = c;
return;
}
dst[c.m_numwords++] = src[i++];
}
src += size;
c.m_numitems++;
}
copyPar = c;
const Uint32 tableFragPtrI = frag.m_tupTableFragPtrI[ent.m_fragBit];
const TupLoc tupLoc = ent.m_tupLoc;
Uint32 size = 0;
c_tup->tuxReadKeys(tableFragPtrI, tupLoc.m_pageId, tupLoc.m_pageOffset, &size, pkData);
ndbrequire(size != 0);
pkSize = size;
}
/*
......
......@@ -180,89 +180,3 @@ Dbtux::execTUX_MAINT_REQ(Signal* signal)
// copy back
*sig = *req;
}
/*
* Read index key attributes from TUP. If buffer is provided the data
* is copied to it. Otherwise pointer is set to signal data.
*/
void
Dbtux::tupReadAttrs(Signal* signal, const Frag& frag, ReadPar& readPar)
{
// define the direct signal
const TreeEnt ent = readPar.m_ent;
TupReadAttrs* const req = (TupReadAttrs*)signal->getDataPtrSend();
req->errorCode = RNIL;
req->requestInfo = 0;
req->tableId = frag.m_tableId;
req->fragId = frag.m_fragId | (ent.m_fragBit << frag.m_fragOff);
req->fragPtrI = frag.m_tupTableFragPtrI[ent.m_fragBit];
req->tupAddr = (Uint32)-1;
req->tupVersion = ent.m_tupVersion;
req->pageId = ent.m_tupLoc.m_pageId;
req->pageOffset = ent.m_tupLoc.m_pageOffset;
req->bufferId = 0;
// add count and list of attribute ids
Data data = (Uint32*)req + TupReadAttrs::SignalLength;
data[0] = readPar.m_count;
data += 1;
const DescEnt& descEnt = getDescEnt(frag.m_descPage, frag.m_descOff);
for (Uint32 i = 0; i < readPar.m_count; i++) {
jam();
const DescAttr& descAttr = descEnt.m_descAttr[readPar.m_first + i];
data.ah() = AttributeHeader(descAttr.m_primaryAttrId, 0);
data += 1;
}
// execute
EXECUTE_DIRECT(DBTUP, GSN_TUP_READ_ATTRS, signal, TupReadAttrs::SignalLength);
jamEntry();
ndbrequire(req->errorCode == 0);
// data is at output
if (readPar.m_data == 0) {
readPar.m_data = data;
} else {
jam();
CopyPar copyPar;
copyPar.m_items = readPar.m_count;
copyPar.m_headers = true;
copyAttrs(readPar.m_data, data, copyPar);
}
}
/*
* Read primary keys. Copy the data without attribute headers into the
* given buffer. Number of words is returned in ReadPar argument.
*/
void
Dbtux::tupReadKeys(Signal* signal, const Frag& frag, ReadPar& readPar)
{
// define the direct signal
const TreeEnt ent = readPar.m_ent;
TupReadAttrs* const req = (TupReadAttrs*)signal->getDataPtrSend();
req->errorCode = RNIL;
req->requestInfo = TupReadAttrs::ReadKeys;
req->tableId = frag.m_tableId;
req->fragId = frag.m_fragId | (ent.m_fragBit << frag.m_fragOff);
req->fragPtrI = frag.m_tupTableFragPtrI[ent.m_fragBit];
req->tupAddr = (Uint32)-1;
req->tupVersion = RNIL; // not used
req->pageId = ent.m_tupLoc.m_pageId;
req->pageOffset = ent.m_tupLoc.m_pageOffset;
req->bufferId = 0;
// execute
EXECUTE_DIRECT(DBTUP, GSN_TUP_READ_ATTRS, signal, TupReadAttrs::SignalLength);
jamEntry();
ndbrequire(req->errorCode == 0);
// copy out in special format
ConstData data = (Uint32*)req + TupReadAttrs::SignalLength;
const Uint32 numKeys = data[0];
data += 1 + numKeys;
// copy out without headers
ndbrequire(readPar.m_data != 0);
CopyPar copyPar;
copyPar.m_items = numKeys;
copyPar.m_headers = false;
copyAttrs(readPar.m_data, data, copyPar);
// return counts
readPar.m_count = numKeys;
readPar.m_size = copyPar.m_numwords;
}
......@@ -379,8 +379,8 @@ Dbtux::execACC_CHECK_SCAN(Signal* signal)
scanNext(signal, scanPtr);
}
// for reading tuple key in Current or Locked state
ReadPar keyPar;
keyPar.m_data = 0; // indicates not yet done
Data pkData = c_dataBuffer;
unsigned pkSize = 0; // indicates not yet done
if (scan.m_state == ScanOp::Current) {
// found an entry to return
jam();
......@@ -389,9 +389,7 @@ Dbtux::execACC_CHECK_SCAN(Signal* signal)
jam();
const TreeEnt ent = scan.m_scanPos.m_ent;
// read tuple key
keyPar.m_ent = ent;
keyPar.m_data = c_dataBuffer;
tupReadKeys(signal, frag, keyPar);
readTablePk(frag, ent, pkSize, pkData);
// get read lock or exclusive lock
AccLockReq* const lockReq = (AccLockReq*)signal->getDataPtrSend();
lockReq->returnCode = RNIL;
......@@ -403,9 +401,9 @@ Dbtux::execACC_CHECK_SCAN(Signal* signal)
lockReq->tableId = scan.m_tableId;
lockReq->fragId = frag.m_fragId | (ent.m_fragBit << frag.m_fragOff);
lockReq->fragPtrI = frag.m_accTableFragPtrI[ent.m_fragBit];
const Uint32* const buf32 = static_cast<Uint32*>(keyPar.m_data);
const Uint32* const buf32 = static_cast<Uint32*>(pkData);
const Uint64* const buf64 = reinterpret_cast<const Uint64*>(buf32);
lockReq->hashValue = md5_hash(buf64, keyPar.m_size);
lockReq->hashValue = md5_hash(buf64, pkSize);
lockReq->tupAddr = getTupAddr(frag, ent);
lockReq->transId1 = scan.m_transId1;
lockReq->transId2 = scan.m_transId2;
......@@ -480,11 +478,9 @@ Dbtux::execACC_CHECK_SCAN(Signal* signal)
const TreeEnt ent = scan.m_scanPos.m_ent;
if (scan.m_keyInfo) {
jam();
if (keyPar.m_data == 0) {
if (pkSize == 0) {
jam();
keyPar.m_ent = ent;
keyPar.m_data = c_dataBuffer;
tupReadKeys(signal, frag, keyPar);
readTablePk(frag, ent, pkSize, pkData);
}
}
// conf signal
......@@ -510,10 +506,10 @@ Dbtux::execACC_CHECK_SCAN(Signal* signal)
// add key info
if (scan.m_keyInfo) {
jam();
conf->keyLength = keyPar.m_size;
conf->keyLength = pkSize;
// piggy-back first 4 words of key data
for (unsigned i = 0; i < 4; i++) {
conf->key[i] = i < keyPar.m_size ? keyPar.m_data[i] : 0;
conf->key[i] = i < pkSize ? pkData[i] : 0;
}
signalLength = 11;
}
......@@ -525,18 +521,18 @@ Dbtux::execACC_CHECK_SCAN(Signal* signal)
EXECUTE_DIRECT(blockNo, GSN_NEXT_SCANCONF, signal, signalLength);
}
// send rest of key data
if (scan.m_keyInfo && keyPar.m_size > 4) {
if (scan.m_keyInfo && pkSize > 4) {
unsigned total = 4;
while (total < keyPar.m_size) {
while (total < pkSize) {
jam();
unsigned length = keyPar.m_size - total;
unsigned length = pkSize - total;
if (length > 20)
length = 20;
signal->theData[0] = scan.m_userPtr;
signal->theData[1] = 0;
signal->theData[2] = 0;
signal->theData[3] = length;
memcpy(&signal->theData[4], &keyPar.m_data[total], length << 2);
memcpy(&signal->theData[4], &pkData[total], length << 2);
sendSignal(scan.m_userRef, GSN_ACC_SCAN_INFO24,
signal, 4 + length, JBB);
total += length;
......@@ -895,35 +891,25 @@ Dbtux::scanNext(Signal* signal, ScanOpPtr scanPtr)
bool
Dbtux::scanVisible(Signal* signal, ScanOpPtr scanPtr, TreeEnt ent)
{
TupQueryTh* const req = (TupQueryTh*)signal->getDataPtrSend();
const ScanOp& scan = *scanPtr.p;
const Frag& frag = *c_fragPool.getPtr(scan.m_fragPtrI);
/* Assign table, fragment, tuple address + version */
Uint32 tableId = frag.m_tableId;
Uint32 fragBit = ent.m_fragBit;
Uint32 tableFragPtrI = frag.m_tupTableFragPtrI[fragBit];
Uint32 fragId = frag.m_fragId | (fragBit << frag.m_fragOff);
Uint32 tupAddr = getTupAddr(frag, ent);
Uint32 tupVersion = ent.m_tupVersion;
/* Check for same tuple twice in row */
// check for same tuple twice in row
if (scan.m_lastEnt.m_tupLoc == ent.m_tupLoc &&
scan.m_lastEnt.m_fragBit == fragBit) {
jam();
return false;
}
req->tableId = tableId;
req->fragId = fragId;
req->tupAddr = tupAddr;
req->tupVersion = tupVersion;
/* Assign transaction info, trans id + savepoint id */
Uint32 transId1 = scan.m_transId1;
Uint32 transId2 = scan.m_transId2;
Uint32 savePointId = scan.m_savePointId;
req->transId1 = transId1;
req->transId2 = transId2;
req->savePointId = savePointId;
EXECUTE_DIRECT(DBTUP, GSN_TUP_QUERY_TH, signal, TupQueryTh::SignalLength);
bool ret = c_tup->tuxQueryTh(tableFragPtrI, tupAddr, tupVersion, transId1, transId2, savePointId);
jamEntry();
return (bool)req->returnCode;
return ret;
}
/*
......
index maintenance overhead
==========================
ordered index performance
=========================
"mc02" 2x1700 MHz linux-2.4.9 gcc-2.96 -O3 one db-node
case a: index on Unsigned
testOIBasic -case u -table 1 -index 1 -fragtype small -threads 10 -rows 100000 -subloop 1 -nologging
case a: maintenance: index on Unsigned
testOIBasic -case u -table 1 -index 2 -fragtype small -threads 10 -rows 100000 -subloop 1 -nologging
case b: index on Varchar(5) + Varchar(5) + Varchar(20) + Unsigned
testOIBasic -case u -table 2 -index 4 -fragtype small -threads 10 -rows 100000 -subloop 1 -nologging
case b: maintenance: index on Varchar(5) + Varchar(5) + Varchar(20) + Unsigned
testOIBasic -case u -table 2 -index 5 -fragtype small -threads 10 -rows 100000 -subloop 1 -nologging
case c: full scan: index on PK Unsigned
testOIBasic -case v -table 1 -index 1 -fragtype small -threads 10 -rows 100000 -subloop 1 -nologging
case d: scan 1 tuple via EQ: index on PK Unsigned
testOIBasic -case w -table 1 -index 1 -fragtype small -threads 10 -rows 100000 -samples 10000 -subloop 1 -nologging -v2
a, b
1 million rows, pk update without index, pk update with index
shows ms / 1000 rows for each and pct overhead
the figures are based on single run on idle machine
c
1 million rows, index on PK, full table scan, full index scan
shows ms / 1000 rows for each and index time pct
d
1 million rows, index on PK, read table via each pk, scan index for each pk
shows ms / 1000 rows for each and index time pct
samples 10% of all PKs (100,000 pk reads, 100,000 scans)
040616 mc02/a 40 ms 87 ms 114 pct
mc02/b 51 ms 128 ms 148 pct
......@@ -51,5 +66,12 @@ optim 11 mc02/a 43 ms 63 ms 46 pct
optim 12 mc02/a 38 ms 55 ms 43 pct
mc02/b 47 ms 77 ms 63 pct
mc02/c 10 ms 14 ms 147 pct
mc02/d 176 ms 281 ms 159 pct
optim 13 mc02/a 40 ms 57 ms 42 pct
mc02/b 47 ms 77 ms 61 pct
mc02/c 9 ms 13 ms 150 pct
mc02/d 170 ms 256 ms 150 pct
vim: set et:
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment