/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996-2005
 *	Sleepycat Software.  All rights reserved.
 *
 * $Id: mp.h,v 12.5 2005/08/08 14:52:30 bostic Exp $
 */

#ifndef	_DB_MP_H_
#define	_DB_MP_H_

/* Forward declarations of the mpool structures... */
struct __bh;
struct __db_mpool_hash;
struct __db_mpreg;
struct __mpool;

/* ...and the short names used for them throughout the mpool code. */
typedef struct __bh		BH;
typedef struct __db_mpool_hash	DB_MPOOL_HASH;
typedef struct __db_mpreg	DB_MPREG;
typedef struct __mpool		MPOOL;

				/* Minimum cache size: at least 20KB. */
#define	DB_CACHESIZE_MIN	(20 * 1024)

/*
 * DB_MPOOLFILE initialization methods cannot be called after open is called;
 * other methods cannot be called before open is called.
 */
/*
 * MPF_ILLEGAL_AFTER_OPEN --
 *	If MP_OPEN_CALLED is set on dbmfp, return from the calling function
 *	with the value of __db_mi_open(dbmfp->dbenv, name, 1).
 *
 * NOTE(review): the expansion is a bare `if' statement that already ends
 * in a semicolon, so this macro is only safe as a complete statement on
 * its own; it is not safe as the unbraced body of an if/else.
 */
#define	MPF_ILLEGAL_AFTER_OPEN(dbmfp, name)				\
	if (F_ISSET(dbmfp, MP_OPEN_CALLED))				\
		return (__db_mi_open((dbmfp)->dbenv, name, 1));
/*
 * MPF_ILLEGAL_BEFORE_OPEN --
 *	If MP_OPEN_CALLED is NOT set on dbmfp, return from the calling
 *	function with the value of __db_mi_open(dbmfp->dbenv, name, 0).
 *	The same statement-only restriction applies.
 */
#define	MPF_ILLEGAL_BEFORE_OPEN(dbmfp, name)				\
	if (!F_ISSET(dbmfp, MP_OPEN_CALLED))				\
		return (__db_mi_open((dbmfp)->dbenv, name, 0));

/*
 * db_sync_op --
 *	The kind of cache flush being requested.
 */
typedef enum {
	DB_SYNC_ALLOC,		/* Flush for allocation. */
	DB_SYNC_CACHE,		/* Checkpoint or flush entire cache. */
	DB_SYNC_FILE,		/* Flush file. */
	DB_SYNC_TRICKLE		/* Trickle sync. */
} db_sync_op;

/*
 * DB_MPOOL --
 *	Per-process memory pool structure.
 */
struct __db_mpool {
	/* These fields need to be protected for multi-threaded support. */
jimw@mysql.com's avatar
jimw@mysql.com committed
45
	db_mutex_t mutex;		/* Thread mutex. */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
46

jimw@mysql.com's avatar
jimw@mysql.com committed
47 48 49 50 51 52
	/*
	 * DB_MPREG structure for the DB pgin/pgout routines.
	 *
	 * Linked list of application-specified pgin/pgout routines.
	 */
	DB_MPREG *pg_inout;
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92
	LIST_HEAD(__db_mpregh, __db_mpreg) dbregq;

					/* List of DB_MPOOLFILE's. */
	TAILQ_HEAD(__db_mpoolfileh, __db_mpoolfile) dbmfq;

	/*
	 * The dbenv, nreg and reginfo fields are not thread protected,
	 * as they are initialized during mpool creation, and not modified
	 * again.
	 */
	DB_ENV	   *dbenv;		/* Enclosing environment. */

	u_int32_t   nreg;		/* N underlying cache regions. */
	REGINFO	   *reginfo;		/* Underlying cache regions. */
};

/*
 * DB_MPREG --
 *	DB_MPOOL registry of pgin/pgout functions.  Each entry holds a file
 *	type together with one pgin routine and one pgout routine, and is
 *	linked to other entries through the q field.
 */
struct __db_mpreg {
	LIST_ENTRY(__db_mpreg) q;	/* Linked list. */

	int32_t ftype;			/* File type. */
					/* Pgin, pgout routines. */
	int (*pgin) __P((DB_ENV *, db_pgno_t, void *, DBT *));
	int (*pgout) __P((DB_ENV *, db_pgno_t, void *, DBT *));
};

/*
 * NCACHE --
 *	Select a cache based on the file and the page number.  Assumes accesses
 *	are uniform across pages, which is probably OK.  What we really want to
 *	avoid is anything that puts all pages from any single file in the same
 *	cache, as we expect that file access will be bursty, and to avoid
 *	putting all page number N pages in the same cache as we expect access
 *	to the metapages (page 0) and the root of a btree (page 1) to be much
 *	more frequent than a random data page.
 *
 *	mp is cast to MPOOL * and its nreg field is the modulus; mf_offset is
 *	truncated to 32 bits and shifted right by 3 before being XOR'd with
 *	pgno.  Every argument is parenthesized so that an expression such as
 *	"base + offset" passed as mp is cast as a whole.
 */
#define	NCACHE(mp, mf_offset, pgno)					\
	(((pgno) ^ ((u_int32_t)(mf_offset) >> 3)) % ((MPOOL *)(mp))->nreg)

/*
 * NBUCKET --
 *	 We make the assumption that early pages of the file are more likely
 *	 to be retrieved than the later pages, which means the top bits will
 *	 be more interesting for hashing as they're less likely to collide.
 *	 That said, 512 8K pages represent a 4MB file, so only reasonably
 *	 large files will have page numbers with any other than the bottom 9
 *	 bits set.  We XOR in the MPOOL offset of the MPOOLFILE that backs the
 *	 page, since that should also be unique for the page.  We don't want
 *	 to do anything very fancy -- speed is more important to us than using
 *	 good hashing.
 *
 *	 The result is reduced modulo mc->htab_buckets.
 */
#define	NBUCKET(mc, mf_offset, pgno)					\
	(((pgno) ^ ((mf_offset) << 9)) % (mc)->htab_buckets)

/* Macros to lock/unlock the mpool region as a whole. */
/*
 * MPOOL_SYSTEM_LOCK, MPOOL_SYSTEM_UNLOCK --
 *	Lock/unlock mtx_region of the MPOOL found at reginfo[0].primary of
 *	the DB_MPOOL handle stored in dbenv->mp_handle.
 */
#define	MPOOL_SYSTEM_LOCK(dbenv)					\
	MUTEX_LOCK(dbenv, ((MPOOL *)((DB_MPOOL *)			\
	    (dbenv)->mp_handle)->reginfo[0].primary)->mtx_region)
#define	MPOOL_SYSTEM_UNLOCK(dbenv)					\
	MUTEX_UNLOCK(dbenv, ((MPOOL *)((DB_MPOOL *)			\
	    (dbenv)->mp_handle)->reginfo[0].primary)->mtx_region)

/*
 * Macros to lock/unlock a specific mpool region: infop->primary is treated
 * as an MPOOL and its mtx_region is locked/unlocked.
 */
#define	MPOOL_REGION_LOCK(dbenv, infop)					\
	MUTEX_LOCK(dbenv, ((MPOOL *)(infop)->primary)->mtx_region)
#define	MPOOL_REGION_UNLOCK(dbenv, infop)				\
	MUTEX_UNLOCK(dbenv, ((MPOOL *)(infop)->primary)->mtx_region)

ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
124 125 126 127 128 129 130 131 132 133 134 135 136 137 138
/*
 * MPOOL --
 *	Shared memory pool region.
 */
struct __mpool {
	/*
	 * The memory pool can be broken up into individual pieces/files.
	 * Not what we would have liked, but on Solaris you can allocate
	 * only a little more than 2GB of memory in a contiguous chunk,
	 * and I expect to see more systems with similar issues.
	 *
	 * While this structure is duplicated in each piece of the cache,
	 * the first of these pieces/files describes the entire pool, the
	 * second only describe a piece of the cache.
	 */
jimw@mysql.com's avatar
jimw@mysql.com committed
139
	db_mutex_t	mtx_region;	/* Region mutex. */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
140 141 142 143 144 145 146 147 148

	/*
	 * The lsn field and list of underlying MPOOLFILEs are thread protected
	 * by the region lock.
	 */
	DB_LSN	  lsn;			/* Maximum checkpoint LSN. */

	SH_TAILQ_HEAD(__mpfq) mpfq;	/* List of MPOOLFILEs. */

jimw@mysql.com's avatar
jimw@mysql.com committed
149 150 151 152 153 154
	/* Configuration information: protected by the region lock. */
	size_t mp_mmapsize;		/* Maximum file size for mmap. */
	int    mp_maxopenfd;		/* Maximum open file descriptors. */
	int    mp_maxwrite;		/* Maximum buffers to write. */
	int    mp_maxwrite_sleep;	/* Sleep after writing max buffers. */

ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172
	/*
	 * The nreg, regids and maint_off fields are not thread protected,
	 * as they are initialized during mpool creation, and not modified
	 * again.
	 */
	u_int32_t nreg;			/* Number of underlying REGIONS. */
	roff_t	  regids;		/* Array of underlying REGION Ids. */

	/*
	 * The following structure fields only describe the per-cache portion
	 * of the region.
	 *
	 * The htab and htab_buckets fields are not thread protected as they
	 * are initialized during mpool creation, and not modified again.
	 *
	 * The last_checked and lru_count fields are thread protected by
	 * the region lock.
	 */
jimw@mysql.com's avatar
jimw@mysql.com committed
173 174 175 176
	u_int32_t htab_buckets;	/* Number of hash table entries. */
	roff_t	  htab;		/* Hash table offset. */
	u_int32_t last_checked;	/* Last bucket checked for free. */
	u_int32_t lru_count;		/* Counter for buffer LRU */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
177 178 179 180 181 182 183 184

	/*
	 * The stat fields are generally not thread protected, and cannot be
	 * trusted.  Note that st_pages is an exception, and is always updated
	 * inside a region lock (although it is sometimes read outside of the
	 * region lock).
	 */
	DB_MPOOL_STAT stat;		/* Per-cache mpool statistics. */
jimw@mysql.com's avatar
jimw@mysql.com committed
185 186 187 188 189 190 191

	/*
	 * We track page puts so that we can decide when allocation is never
	 * going to succeed.  We don't lock the field, all we care about is
	 * if it changes.
	 */
	u_int32_t  put_counter;		/* Count of page put calls. */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
192 193 194
};

struct __db_mpool_hash {
jimw@mysql.com's avatar
jimw@mysql.com committed
195
	db_mutex_t	mtx_hash;	/* Per-bucket mutex. */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
196 197 198 199 200 201 202 203 204 205 206 207

	DB_HASHTAB	hash_bucket;	/* Head of bucket. */

	u_int32_t	hash_page_dirty;/* Count of dirty pages. */
	u_int32_t	hash_priority;	/* Minimum priority of bucket buffer. */
};

/*
 * The base mpool priority is 1/4th of the name space, or just under 2^30.
 * When the LRU counter wraps, we shift everybody down to a base-relative
 * value.
 *
 * MPOOL_BASE_DECREMENT is the amount of that shift: the whole 32-bit name
 * space less the base quarter, i.e. UINT32_MAX - (UINT32_MAX / 4).
 */
#define	MPOOL_BASE_DECREMENT	(UINT32_MAX - (UINT32_MAX / 4))

/*
 * Mpool priorities from low to high.  Defined in terms of fractions of the
 * buffers in the pool.
 *
 * The negative values are parenthesized so that each one always expands as
 * a single operand, whatever expression it is used in.
 */
#define	MPOOL_PRI_VERY_LOW	(-1)	/* Dead duck.  Check and set to 0. */
#define	MPOOL_PRI_LOW		(-2)	/* Low. */
#define	MPOOL_PRI_DEFAULT	0	/* No adjustment -- special case.*/
#define	MPOOL_PRI_HIGH		10	/* With the dirty buffers. */
#define	MPOOL_PRI_DIRTY		10	/* Dirty gets a 10% boost. */
#define	MPOOL_PRI_VERY_HIGH	1	/* Add number of buffers in pool. */

/*
 * MPOOLFILE --
 *	Shared DB_MPOOLFILE information.
 */
struct __mpoolfile {
jimw@mysql.com's avatar
jimw@mysql.com committed
226
	db_mutex_t mutex;		/* MPOOLFILE mutex. */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
227 228 229 230 231 232 233

	/* Protected by MPOOLFILE mutex. */
	u_int32_t mpf_cnt;		/* Ref count: DB_MPOOLFILEs. */
	u_int32_t block_cnt;		/* Ref count: blocks in cache. */

	roff_t	  path_off;		/* File name location. */

jimw@mysql.com's avatar
jimw@mysql.com committed
234 235 236 237 238 239 240 241 242 243 244 245
	/*
	 * The following are used for file compaction processing.
	 * They are only used when a thread is in the process
	 * of trying to move free pages to the end of the file.
	 * Other threads may look here when freeing a page.
	 * Protected by a lock on the metapage.
	 */
	u_int32_t free_ref;		/* Refcount to freelist. */
	u_int32_t free_cnt;		/* Count of free pages. */
	size_t	  free_size;		/* Allocated size of free list. */
	roff_t	  free_list;		/* Offset to free list. */

jimw@mysql.com's avatar
jimw@mysql.com committed
246 247 248 249 250 251 252
	/*
	 * We normally don't lock the deadfile field when we read it since we
	 * only care if the field is zero or non-zero.  We do lock on read when
	 * searching for a matching MPOOLFILE -- see that code for more detail.
	 */
	int32_t	  deadfile;		/* Dirty pages can be discarded. */

ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
253 254 255 256
	/* Protected by mpool cache 0 region lock. */
	SH_TAILQ_ENTRY q;		/* List of MPOOLFILEs */
	db_pgno_t last_pgno;		/* Last page in the file. */
	db_pgno_t orig_last_pgno;	/* Original last page in the file. */
jimw@mysql.com's avatar
jimw@mysql.com committed
257
	db_pgno_t maxpgno;		/* Maximum page number. */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
258 259 260 261 262 263 264 265

	/*
	 * None of the following fields are thread protected.
	 *
	 * There are potential races with the ftype field because it's read
	 * without holding a lock.  However, it has to be set before adding
	 * any buffers to the cache that depend on it being set, so there
	 * would need to be incorrect operation ordering to have a problem.
jimw@mysql.com's avatar
jimw@mysql.com committed
266 267 268 269
	 */
	int32_t	  ftype;		/* File type. */

	/*
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
270 271 272
	 * There are potential races with the priority field because it's read
	 * without holding a lock.  However, a collision is unlikely and if it
	 * happens is of little consequence.
jimw@mysql.com's avatar
jimw@mysql.com committed
273 274 275 276 277 278 279 280 281 282 283 284 285 286 287
	 */
	int32_t   priority;		/* Priority when unpinning buffer. */

	/*
	 * There are potential races with the file_written field (many threads
	 * may be writing blocks at the same time), and with no_backing_file
	 * and unlink_on_close fields, as they may be set while other threads
	 * are reading them.  However, we only care if the field value is zero
	 * or non-zero, so don't lock the memory.
	 *
	 * !!!
	 * Theoretically, a 64-bit architecture could put two of these fields
	 * in a single memory operation and we could race.  I have never seen
	 * an architecture where that's a problem, and I believe Java requires
	 * that to never be the case.
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
288
	 *
jimw@mysql.com's avatar
jimw@mysql.com committed
289 290 291 292 293 294 295 296 297
	 * File_written is set whenever a buffer is marked dirty in the cache.
	 * It can be cleared in some cases, after all dirty buffers have been
	 * written AND the file has been flushed to disk.
	 */
	int32_t	  file_written;		/* File was written. */
	int32_t	  no_backing_file;	/* Never open a backing file. */
	int32_t	  unlink_on_close;	/* Unlink file on last close. */

	/*
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
298 299 300 301 302 303
	 * We do not protect the statistics in "stat" because of the cost of
	 * the mutex in the get/put routines.  There is a chance that a count
	 * will get lost.
	 */
	DB_MPOOL_FSTAT stat;		/* Per-file mpool statistics. */

jimw@mysql.com's avatar
jimw@mysql.com committed
304 305 306 307
	/*
	 * The remaining fields are initialized at open and never subsequently
	 * modified.
	 */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
308 309 310 311 312 313 314 315
	int32_t	  lsn_off;		/* Page's LSN offset. */
	u_int32_t clear_len;		/* Bytes to clear on page create. */

	roff_t	  fileid_off;		/* File ID string location. */

	roff_t	  pgcookie_len;		/* Pgin/pgout cookie length. */
	roff_t	  pgcookie_off;		/* Pgin/pgout cookie location. */

jimw@mysql.com's avatar
jimw@mysql.com committed
316 317 318 319 320 321 322 323 324 325 326 327 328
	/*
	 * The flags are initialized at open and never subsequently modified.
	 */
#define	MP_CAN_MMAP		0x001	/* If the file can be mmap'd. */
#define	MP_DIRECT		0x002	/* No OS buffering. */
#define	MP_DURABLE_UNKNOWN	0x004	/* We don't care about durability. */
#define	MP_EXTENT		0x008	/* Extent file. */
#define	MP_FAKE_DEADFILE	0x010	/* Deadfile field: fake flag. */
#define	MP_FAKE_FILEWRITTEN	0x020	/* File_written field: fake flag. */
#define	MP_FAKE_NB		0x040	/* No_backing_file field: fake flag. */
#define	MP_FAKE_UOC		0x080	/* Unlink_on_close field: fake flag. */
#define	MP_NOT_DURABLE		0x100	/* File is not durable. */
#define	MP_TEMP			0x200	/* Backing file is a temporary. */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
329 330 331
	u_int32_t  flags;
};

/*
 * Flags to __memp_bh_free.
 */
#define	BH_FREE_FREEMEM		0x01
#define	BH_FREE_UNLOCKED	0x02

ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
338 339 340 341 342
/*
 * BH --
 *	Buffer header.
 */
struct __bh {
jimw@mysql.com's avatar
jimw@mysql.com committed
343
	db_mutex_t	mtx_bh;		/* Buffer thread/process mutex. */
ram@mysql.r18.ru's avatar
ram@mysql.r18.ru committed
344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370

	u_int16_t	ref;		/* Reference count. */
	u_int16_t	ref_sync;	/* Sync wait-for reference count. */

#define	BH_CALLPGIN	0x001		/* Convert the page before use. */
#define	BH_DIRTY	0x002		/* Page was modified. */
#define	BH_DIRTY_CREATE	0x004		/* Page created, must be written. */
#define	BH_DISCARD	0x008		/* Page is useless. */
#define	BH_LOCKED	0x010		/* Page is locked (I/O in progress). */
#define	BH_TRASH	0x020		/* Page is garbage. */
	u_int16_t	flags;

	u_int32_t	priority;	/* LRU priority. */
	SH_TAILQ_ENTRY	hq;		/* MPOOL hash bucket queue. */

	db_pgno_t pgno;			/* Underlying MPOOLFILE page number. */
	roff_t	  mf_offset;		/* Associated MPOOLFILE offset. */

	/*
	 * !!!
	 * This array must be at least size_t aligned -- the DB access methods
	 * put PAGE and other structures into it, and then access them directly.
	 * (We guarantee size_t alignment to applications in the documentation,
	 * too.)
	 */
	u_int8_t   buf[1];		/* Variable length data. */
};
/*
 * Flags to __memp_ftruncate.
 */
#define	MP_TRUNC_RECOVER	0x01

#include "dbinc_auto/mp_ext.h"
#endif /* !_DB_MP_H_ */