Commit 215f0a2a authored by Zardosht Kasheff, committed by Yoni Fogel

[t:4235], modify a bunch of comments as a result of the code review

git-svn-id: file:///svn/toku/tokudb@37576 c7de825b-a66e-492c-adef-691d508d4ae1
parent 61478b29
@@ -280,7 +280,7 @@ struct cachefile {
int n_background_jobs; // how many jobs in the cachetable's kibbutz or
                       // on the cleaner thread (anything
                       // cachetable_flush_cachefile should wait on)
                       // are working on this cachefile. Each job should, at the
                       // end, obtain the cachetable mutex, decrement
                       // this variable, and broadcast the
                       // kibbutz_wait condition variable to let
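A minimal sketch of the completion protocol this comment describes, assuming the mutex is the one taken by cachetable_lock, that kibbutz_wait is a pthread condition variable on the cachetable, and that cachetable_unlock is the counterpart of cachetable_lock (the shipped remove_background_job in the next hunk is the authoritative version):

static void remove_background_job_sketch(CACHEFILE cf) {
    CACHETABLE ct = cf->cachetable;
    cachetable_lock(ct);                                // obtain the cachetable mutex
    cf->n_background_jobs--;                            // this job is done with cf
    int r = pthread_cond_broadcast(&ct->kibbutz_wait);  // let waiters re-check the count
    assert(r == 0);
    cachetable_unlock(ct);
}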
@@ -320,7 +320,7 @@ void remove_background_job(CACHEFILE cf, bool already_locked)
}
void cachefile_kibbutz_enq (CACHEFILE cf, void (*f)(void*), void *extra)
// The function f must call remove_background_job when it completes
{
    add_background_job(cf, false);
    toku_kibbutz_enq(cf->cachetable->kibbutz, f, extra);
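A hedged usage sketch of this contract: the enqueued function does its work and then makes the matching remove_background_job call (my_background_work and struct my_work are hypothetical names):

struct my_work { CACHEFILE cf; /* plus whatever the job needs */ };

static void my_background_work(void *extra) {
    struct my_work *w = extra;
    // ... do the actual background work on w->cf ...
    remove_background_job(w->cf, false);  // false: cachetable mutex not already held
}
// enqueue it: cachefile_kibbutz_enq(cf, my_background_work, w);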
@@ -1018,6 +1018,12 @@ int toku_cachefile_close (CACHEFILE *cfp, char **error_string, BOOL oplsn_valid,
    }
}
//
// The client calls this function to flush all PAIRs belonging to
// a cachefile from the cachetable. The client must ensure that
// while this function is called, no other thread does work on the
// cachefile.
//
int toku_cachefile_flush (CACHEFILE cf) {
    CACHETABLE ct = cf->cachetable;
    cachetable_lock(ct);
@@ -1752,6 +1758,13 @@ write_locked_pair_for_checkpoint(CACHETABLE ct, PAIR p)
    }
}
//
// For each PAIR associated with these CACHEFILEs and CACHEKEYs,
// if the checkpoint_pending bit is set and the PAIR is dirty, write the PAIR
// to disk.
// We assume the PAIRs passed in have been locked by the client whose calls
// into the cachetable eventually make it here.
//
static void checkpoint_dependent_pairs(
    CACHETABLE ct,
    u_int32_t num_dependent_pairs, // number of dependent pairs that we may need to checkpoint
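The rest of the signature and body are elided by the diff; a minimal sketch of the loop the comment describes, with hypothetical parameter names (dependent_cfs, dependent_keys) and a hypothetical lookup helper:

for (u_int32_t i = 0; i < num_dependent_pairs; i++) {
    PAIR p = cachetable_find_pair(ct, dependent_cfs[i], dependent_keys[i]);  // hypothetical
    // the client already holds the lock on p, per the comment above
    if (p->checkpoint_pending && p->dirty) {
        write_locked_pair_for_checkpoint(ct, p);
    }
}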
@@ -1949,24 +1962,19 @@ write_pair_for_checkpoint (CACHETABLE ct, PAIR p)
        write_for_checkpoint_pair = NULL;
}
//
// cachetable lock and PAIR lock are held on entry
// On exit, cachetable lock is still held, but PAIR lock
// is either released or ownership of PAIR lock is transferred
// via the completion queue.
//
static void
do_partial_fetch(CACHETABLE ct, CACHEFILE cachefile, PAIR p, CACHETABLE_PARTIAL_FETCH_CALLBACK pf_callback, void *read_extraargs)
{
    PAIR_ATTR old_attr = p->attr;
    PAIR_ATTR new_attr = zero_attr;
    // As of Dr. No, only clean PAIRs may have pieces missing,
    // so we do a sanity check here.
    assert(!p->dirty);
    p->state = CTPAIR_READING;
@@ -2078,19 +2086,8 @@ int toku_cachetable_get_and_pin_with_dep_pairs (
    // and then call a callback to retrieve what we need
    //
    if (partial_fetch_required) {
        // As of Dr. No, only clean PAIRs may have pieces missing,
        // so we do a sanity check here.
        assert(!p->dirty);
        p->state = CTPAIR_READING;
...
@@ -143,33 +143,39 @@ typedef void (*CACHETABLE_FLUSH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, void
// Can access fd (fd is protected by a readlock during call)
typedef int (*CACHETABLE_FETCH_CALLBACK)(CACHEFILE, int fd, CACHEKEY key, u_int32_t fullhash, void **value, PAIR_ATTR *sizep, int *dirtyp, void *read_extraargs);
// The cachetable calls the partial eviction estimate callback to determine
// whether partial eviction is a cheap operation that may be run on the client thread,
// or whether partial eviction is expensive and should be done on a background (writer) thread.
// The callback conveys this information by setting cost to either PE_CHEAP or PE_EXPENSIVE.
// If cost is PE_EXPENSIVE, then the callback also sets bytes_freed_estimate
// to an estimate of the number of bytes it will free,
// so that the cachetable can estimate how much data is being evicted on background threads.
// If cost is PE_CHEAP, then the callback does not set bytes_freed_estimate.
typedef void (*CACHETABLE_PARTIAL_EVICTION_EST_CALLBACK)(void *brtnode_pv, long* bytes_freed_estimate, enum partial_eviction_cost *cost, void *write_extraargs);
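A hedged sketch of a conforming estimate callback; can_evict_cheaply and estimate_freeable_bytes are hypothetical helpers standing in for whatever the client uses to classify the node:

static void my_pe_est_callback(void *brtnode_pv, long *bytes_freed_estimate,
                               enum partial_eviction_cost *cost, void *write_extraargs) {
    (void) write_extraargs;
    if (can_evict_cheaply(brtnode_pv)) {
        *cost = PE_CHEAP;        // run on the client thread; leave bytes_freed_estimate unset
    } else {
        *cost = PE_EXPENSIVE;    // defer to a background (writer) thread
        *bytes_freed_estimate = estimate_freeable_bytes(brtnode_pv);
    }
}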
// The cachetable calls the partial eviction callback to try to partially evict pieces
// of the PAIR. The callback determines the strategy for what to evict. The callback may choose to free
// nothing, or may choose to free as much as possible.
// old_attr is the PAIR_ATTR of the PAIR when the callback is called.
// new_attr is set to the new PAIR_ATTR after the callback executes partial eviction.
// Requires a write lock to be held on the PAIR in the cachetable while this function is called
typedef int (*CACHETABLE_PARTIAL_EVICTION_CALLBACK)(void *brtnode_pv, PAIR_ATTR old_attr, PAIR_ATTR* new_attr, void *write_extraargs);
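A hedged sketch of a conforming partial eviction callback (evict_cold_pieces and compute_pair_attr are hypothetical); the cachetable holds the PAIR write lock for the duration of the call:

static int my_pe_callback(void *brtnode_pv, PAIR_ATTR old_attr,
                          PAIR_ATTR *new_attr, void *write_extraargs) {
    (void) old_attr; (void) write_extraargs;
    evict_cold_pieces(brtnode_pv);              // strategy is entirely up to the callback
    *new_attr = compute_pair_attr(brtnode_pv);  // report the post-eviction attributes
    return 0;
}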
// The cachetable calls this function to determine if a get_and_pin call requires a partial fetch. If this function returns TRUE,
// then the cachetable will subsequently call CACHETABLE_PARTIAL_FETCH_CALLBACK to perform
// a partial fetch. If this function returns FALSE, then the PAIR's value is returned to the caller as is.
//
// An alternative to having this callback is to always call CACHETABLE_PARTIAL_FETCH_CALLBACK, and let
// CACHETABLE_PARTIAL_FETCH_CALLBACK decide whether to possibly release the ydb lock and perform I/O.
// There is no particular reason why this alternative was not chosen.
// Requires: a read lock to be held on the PAIR
typedef BOOL (*CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK)(void *brtnode_pv, void *read_extraargs);
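A hedged sketch: the check itself is cheap, which is what makes it reasonable to run on the client thread (needed_pieces_in_memory is a hypothetical helper that inspects read_extraargs for the pieces the caller will touch):

static BOOL my_pf_req_callback(void *brtnode_pv, void *read_extraargs) {
    // TRUE: the cachetable will call CACHETABLE_PARTIAL_FETCH_CALLBACK next.
    // FALSE: the PAIR's value is returned to the caller as is.
    return needed_pieces_in_memory(brtnode_pv, read_extraargs) ? FALSE : TRUE;
}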
// The cachetable calls the partial fetch callback when a thread needs to read or decompress a subset of a PAIR into memory.
// An example is needing to read a basement node into memory. Another example is decompressing an internal node's
// message buffer. The cachetable determines if a partial fetch is necessary by first calling CACHETABLE_PARTIAL_FETCH_REQUIRED_CALLBACK.
// The new PAIR_ATTR of the PAIR is returned in sizep.
// Can access fd (fd is protected by a readlock during call)
// Returns: 0 if success, otherwise an error number.
typedef int (*CACHETABLE_PARTIAL_FETCH_CALLBACK)(void *brtnode_pv, void *read_extraargs, int fd, PAIR_ATTR *sizep);
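A hedged sketch of a conforming partial fetch callback (read_missing_pieces and compute_pair_attr are hypothetical):

static int my_pf_callback(void *brtnode_pv, void *read_extraargs, int fd, PAIR_ATTR *sizep) {
    int r = read_missing_pieces(brtnode_pv, read_extraargs, fd);  // may read or decompress
    if (r != 0) return r;                    // propagate the error number
    *sizep = compute_pair_attr(brtnode_pv);  // the new PAIR_ATTR after the fetch
    return 0;
}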
@@ -248,6 +254,16 @@ int toku_cachetable_put(CACHEFILE cf, CACHEKEY key, u_int32_t fullhash,
);
// Get and pin the memory object of a PAIR, and write dependent pairs to disk
// if the dependent pairs are pending a checkpoint.
// Effects: If the memory object is in the cachetable, acquire a PAIR lock on it.
// Otherwise, fetch it from storage by calling the fetch callback. If the fetch
// succeeded, add the memory object to the cachetable with a PAIR lock on it.
// Before returning to the user, if the PAIR object being retrieved, or any of the
// dependent pairs passed in as parameters, must be written to disk for checkpoint,
// then the required PAIRs are written to disk for checkpoint.
// KEY PROPERTY OF DEPENDENT PAIRS: They are already locked by the client
// Returns: 0 if the memory object is in memory, otherwise an error number.
int toku_cachetable_get_and_pin_with_dep_pairs (
    CACHEFILE cachefile,
    CACHEKEY key,
@@ -272,7 +288,7 @@ int toku_cachetable_get_and_pin_with_dep_pairs (
// Get and pin a memory object.
// Effects: If the memory object is in the cachetable, acquire the PAIR lock on it.
// Otherwise, fetch it from storage by calling the fetch callback. If the fetch
// succeeded, add the memory object to the cachetable with a PAIR lock on it.
// Returns: 0 if the memory object is in memory, otherwise an error number.
...
@@ -69,6 +69,12 @@ static void ksignal (KIBBUTZ k) {
    assert(r==0);
}
//
// pops the tail of the kibbutz off the list and works on it
// Note that in toku_kibbutz_enq, items are enqueued at the head,
// so work is done in FIFO order. This is necessary
// to avoid deadlocks in flusher threads.
//
static void *work_on_kibbutz (void *kidv) {
    struct kid *kid = kidv;
    KIBBUTZ k = kid->k;
@@ -100,6 +106,12 @@ static void *work_on_kibbutz (void *kidv) {
    }
}
//
// adds work to the head of the kibbutz
// Note that in work_on_kibbutz, items are popped off the tail for work,
// so work is done in FIFO order. This is necessary
// to avoid deadlocks in flusher threads.
//
void toku_kibbutz_enq (KIBBUTZ k, void (*f)(void*), void *extra) {
    struct todo *XMALLOC(td);
    td->f = f;
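A self-contained sketch of the head-enqueue / tail-pop discipline the two comments describe; the type and field names are assumptions, not the real struct todo / struct kibbutz layout:

typedef struct todo_sketch {
    struct todo_sketch *next, *prev;
} TODO_SKETCH;

typedef struct { TODO_SKETCH *head, *tail; } QUEUE_SKETCH;

static void enq_at_head(QUEUE_SKETCH *q, TODO_SKETCH *td) {
    td->prev = NULL;
    td->next = q->head;
    if (q->head) q->head->prev = td; else q->tail = td;
    q->head = td;
}

// assumes a non-empty queue (a worker would wait on a condvar otherwise)
static TODO_SKETCH *pop_from_tail(QUEUE_SKETCH *q) {
    TODO_SKETCH *td = q->tail;          // the oldest item lives at the tail
    q->tail = td->prev;
    if (q->tail) q->tail->next = NULL; else q->head = NULL;
    return td;                          // hence FIFO order overall
}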
...
@@ -7,9 +7,35 @@
C_BEGIN
//
// The kibbutz is another threadpool meant to do arbitrary work.
// It is introduced in Dr. No, and as of Dr. No, the only work kibbutzim
// do is flusher thread work. In Dr. No, we already have a threadpool for
// the writer threads and a threadpool for serializing brtnodes. A natural
// question is why we introduced another threadpool in Dr. No. The short
// answer is that this was the simplest way to get the flusher threads'
// work done.
//
typedef struct kibbutz *KIBBUTZ;
//
// create a kibbutz where n_workers is the number of threads in the threadpool
//
KIBBUTZ toku_kibbutz_create (int n_workers);
//
// enqueue a workitem in the kibbutz. When the kibbutz is to work on this workitem,
// it calls f(extra).
// At any time, the kibbutz is operating on at most n_workers jobs.
// Other enqueued workitems are on a queue. An invariant is
// that no currently enqueued item was placed on the queue before
// any item that is currently being operated on. Another way to state
// this is that every item currently being worked on was placed on the
// queue before any item that is still waiting on the queue.
//
void toku_kibbutz_enq (KIBBUTZ k, void (*f)(void*), void *extra);
//
// destroys the kibbutz
//
void toku_kibbutz_destroy (KIBBUTZ k);
C_END
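A hedged usage sketch of the three calls above (the worker count and job function are made up; error handling is omitted):

#include <stdio.h>

static void print_job(void *extra) {
    printf("working on: %s\n", (const char *) extra);
}

static void kibbutz_example(void) {
    KIBBUTZ k = toku_kibbutz_create(4);        // threadpool with 4 worker threads
    toku_kibbutz_enq(k, print_job, "item 1");
    toku_kibbutz_enq(k, print_job, "item 2");  // per the invariant, item 1 starts first
    toku_kibbutz_destroy(k);
}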
...