Commit babe38a4 authored by unknown

Bug #28525 Node failures in PGMAN at ndbrequire (line 430)


storage/ndb/src/kernel/blocks/pgman.cpp:
  Under heavy insert load, PGMAN can run out of page entries
  even when the page entry pool is sized at 100 times the
  number of page cache entries.
  
  In this usage pattern the extra entries remain idle
  on the LIRS stack: only ONSTACK is set, and there is
  not enough activity to free them in the normal way.
  
  A study of PGMAN / DBTUP behaviour is needed.
  
  This patch adds a new sublist, SL_IDLE.  When the page
  entry pool is empty, an idle entry is released from the
  front of SL_IDLE, if there is one.  Otherwise, we still crash.
  
  The factor above is reduced from 100 to 10 (still high).
storage/ndb/src/kernel/blocks/pgman.hpp:
  Under heavy insert load, PGMAN can run out of page entries
  even when the page entry pool is sized at 100 times the
  number of page cache entries.
  
  In this usage pattern the extra entries remain idle
  on the LIRS stack: only ONSTACK is set, and there is
  not enough activity to free them in the normal way.
  
  A study of PGMAN / DBTUP behaviour is needed.
  
  This patch adds a new sublist, SL_IDLE.  When the page
  entry pool is empty, an idle entry is released from the
  front of SL_IDLE, if there is one.  Otherwise, we still crash.
  
  The factor above is reduced from 100 to 10 (still high).
parent bf699d71
......@@ -123,8 +123,8 @@ Pgman::execREAD_CONFIG_REQ(Signal* signal)
if (page_buffer > 0)
{
page_buffer /= GLOBAL_PAGE_SIZE; // in pages
m_page_entry_pool.setSize(100*page_buffer);
m_param.m_max_pages = page_buffer;
m_page_entry_pool.setSize(m_param.m_lirs_stack_mult * page_buffer);
m_param.m_max_hot_pages = (page_buffer * 9) / 10;
}
......@@ -141,6 +141,7 @@ Pgman::execREAD_CONFIG_REQ(Signal* signal)
Pgman::Param::Param() :
m_max_pages(64), // smallish for testing
m_lirs_stack_mult(10),
m_max_hot_pages(56),
m_max_loop_count(256),
m_max_io_waits(64),
......@@ -301,6 +302,9 @@ Pgman::get_sublist_no(Page_state state)
{
return Page_entry::SL_LOCKED;
}
if (state == Page_entry::ONSTACK) {
return Page_entry::SL_IDLE;
}
return Page_entry::SL_OTHER;
}
......@@ -415,15 +419,55 @@ Pgman::get_page_entry(Ptr<Page_entry>& ptr, Uint32 file_no, Uint32 page_no)
{
if (find_page_entry(ptr, file_no, page_no))
{
jam();
ndbrequire(ptr.p->m_state != 0);
m_stats.m_page_hits++;
#ifdef VM_TRACE
debugOut << "PGMAN: get_page_entry: found" << endl;
debugOut << "PGMAN: " << ptr << endl;
#endif
return true;
}
if (m_page_entry_pool.getNoOfFree() == 0)
{
jam();
Page_sublist& pl_idle = *m_page_sublist[Page_entry::SL_IDLE];
Ptr<Page_entry> idle_ptr;
if (pl_idle.first(idle_ptr))
{
jam();
#ifdef VM_TRACE
debugOut << "PGMAN: get_page_entry: re-use idle entry" << endl;
debugOut << "PGMAN: " << idle_ptr << endl;
#endif
Page_state state = idle_ptr.p->m_state;
ndbrequire(state == Page_entry::ONSTACK);
Page_stack& pl_stack = m_page_stack;
ndbrequire(pl_stack.hasPrev(idle_ptr));
pl_stack.remove(idle_ptr);
state &= ~ Page_entry::ONSTACK;
set_page_state(idle_ptr, state);
ndbrequire(idle_ptr.p->m_state == 0);
release_page_entry(idle_ptr);
}
}
if (seize_page_entry(ptr, file_no, page_no))
{
jam();
ndbrequire(ptr.p->m_state == 0);
m_stats.m_page_faults++;
#ifdef VM_TRACE
debugOut << "PGMAN: get_page_entry: seize" << endl;
debugOut << "PGMAN: " << ptr << endl;
#endif
return true;
}
......@@ -1929,6 +1973,8 @@ Pgman::verify_page_entry(Ptr<Page_entry> ptr)
break;
case Page_entry::SL_LOCKED:
break;
case Page_entry::SL_IDLE:
break;
case Page_entry::SL_OTHER:
break;
default:
......@@ -1975,8 +2021,11 @@ Pgman::verify_page_lists()
ndbrequire(stack_count == pl_stack.count() || dump_page_lists());
ndbrequire(queue_count == pl_queue.count() || dump_page_lists());
Uint32 hot_count = 0;
Uint32 hot_bound_count = 0;
Uint32 cold_bound_count = 0;
Uint32 stack_request_count = 0;
Uint32 queue_request_count = 0;
Uint32 i1 = RNIL;
for (pl_stack.first(ptr); ptr.i != RNIL; pl_stack.next(ptr))
......@@ -1987,9 +2036,13 @@ Pgman::verify_page_lists()
ndbrequire(state & Page_entry::ONSTACK || dump_page_lists());
if (! pl_stack.hasPrev(ptr))
ndbrequire(state & Page_entry::HOT || dump_page_lists());
if (state & Page_entry::HOT &&
state & Page_entry::BOUND)
hot_bound_count++;
if (state & Page_entry::HOT) {
hot_count++;
if (state & Page_entry::BOUND)
hot_bound_count++;
}
if (state & Page_entry::REQUEST)
stack_request_count++;
}
Uint32 i2 = RNIL;
......@@ -2001,6 +2054,8 @@ Pgman::verify_page_lists()
ndbrequire(state & Page_entry::ONQUEUE || dump_page_lists());
ndbrequire(state & Page_entry::BOUND || dump_page_lists());
cold_bound_count++;
if (state & Page_entry::REQUEST)
queue_request_count++;
}
Uint32 tot_bound_count =
......@@ -2033,7 +2088,11 @@ Pgman::verify_page_lists()
<< " cache:" << m_stats.m_num_pages
<< "(" << locked_bound_count << "L)"
<< " stack:" << pl_stack.count()
<< " hot:" << hot_count
<< " hot_bound:" << hot_bound_count
<< " stack_request:" << stack_request_count
<< " queue:" << pl_queue.count()
<< " queue_request:" << queue_request_count
<< " queuewait:" << queuewait_count << endl;
debugOut << "PGMAN:";
......@@ -2141,6 +2200,8 @@ Pgman::get_sublist_name(Uint32 list_no)
return "busy";
case Page_entry::SL_LOCKED:
return "locked";
case Page_entry::SL_IDLE:
return "idle";
case Page_entry::SL_OTHER:
return "other";
}
......
......@@ -325,8 +325,9 @@ private:
,SL_CALLBACK_IO = 4
,SL_BUSY = 5
,SL_LOCKED = 6
,SL_OTHER = 7
,SUBLIST_COUNT = 8
,SL_IDLE = 7
,SL_OTHER = 8
,SUBLIST_COUNT = 9
};
Uint16 m_file_no; // disk page address set at seize
......@@ -401,6 +402,7 @@ private:
struct Param {
Param();
Uint32 m_max_pages; // max number of cache pages
Uint32 m_lirs_stack_mult; // in m_max_pages (around 3-10)
Uint32 m_max_hot_pages; // max hot cache pages (up to 99%)
Uint32 m_max_loop_count; // limit purely local loops
Uint32 m_max_io_waits;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment