/*****************************************************************************

Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2015, 2018, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0rea.cc
The database buffer read

Created 11/5/1995 Heikki Tuuri
*******************************************************/

#include "ha_prototypes.h"
#include <mysql/service_thd_wait.h>

#include "buf0rea.h"
#include "fil0fil.h"
#include "mtr0mtr.h"
#include "buf0buf.h"
#include "buf0flu.h"
#include "buf0lru.h"
#include "buf0dblwr.h"
#include "ibuf0ibuf.h"
#include "log0recv.h"
#include "trx0sys.h"
#include "os0file.h"
#include "srv0start.h"
#include "srv0srv.h"

/** There must be at least this many pages in buf_pool in the area to start
a random read-ahead */
#define BUF_READ_AHEAD_RANDOM_THRESHOLD(b)	\
				(5 + BUF_READ_AHEAD_AREA(b) / 8)
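/* For example, if BUF_READ_AHEAD_AREA(b) were 64 pages, random read-ahead
would require at least 5 + 64 / 8 = 13 recently accessed pages in the area. */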

/** If there are buf_pool->curr_size per the number below pending reads, then
read-ahead is not done: this is to prevent flooding the buffer pool with
i/o-fixed buffer blocks */
#define BUF_READ_AHEAD_PEND_LIMIT	2
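/* For example, with the limit of 2 above, read-ahead is skipped whenever
more than buf_pool->curr_size / 2 pages already have reads pending. */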

/********************************************************************//**
Unfixes the page, unlatches it,
removes it from page_hash and from the LRU list. */
static
void
buf_read_page_handle_error(
/*=======================*/
	buf_page_t*	bpage)	/*!< in: pointer to the block */
{
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
	const bool	uncompressed = (buf_page_get_state(bpage)
					== BUF_BLOCK_FILE_PAGE);

	/* First unfix and release lock on the bpage */
	buf_pool_mutex_enter(buf_pool);
	mutex_enter(buf_page_get_mutex(bpage));
	ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
	ut_ad(bpage->buf_fix_count == 0);

	/* Set BUF_IO_NONE before we remove the block from LRU list */
	buf_page_set_io_fix(bpage, BUF_IO_NONE);

	if (uncompressed) {
		rw_lock_x_unlock_gen(
			&((buf_block_t*) bpage)->lock,
			BUF_IO_READ);
	}

	mutex_exit(buf_page_get_mutex(bpage));

	/* Remove the block from the LRU list */
	buf_LRU_free_one_page(bpage);

	ut_ad(buf_pool->n_pend_reads > 0);
	buf_pool->n_pend_reads--;

	buf_pool_mutex_exit(buf_pool);
}

/** Low-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there, in which case does nothing.
Sets the io_fix flag and sets an exclusive lock on the buffer frame. The
flag is cleared and the x-lock released by an i/o-handler thread.

@param[out] err		DB_SUCCESS, DB_TABLESPACE_DELETED or
			DB_TABLESPACE_TRUNCATED if we are trying
			to read from a non-existent tablespace, a
			tablespace which is just now being dropped,
			or a tablespace which is truncated
@param[in] sync		true if synchronous aio is desired
@param[in] type		IO type, SIMULATED, IGNORE_MISSING
@param[in] mode		BUF_READ_IBUF_PAGES_ONLY, ...,
@param[in] page_id	page id
@param[in] unzip	true=request uncompressed page
@param[in] ignore_missing_space  true=ignore missing space when reading
@return 1 if a read request was queued, 0 if the page already resided
in buf_pool, or if the page is in the doublewrite buffer blocks in
which case it is never read into the pool, or if the tablespace does
not exist or is being dropped */
static
ulint
buf_read_page_low(
	dberr_t*		err,
	bool			sync,
	ulint			type,
	ulint			mode,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	bool			unzip,
	bool			ignore_missing_space = false)
{
	buf_page_t*	bpage;

	*err = DB_SUCCESS;

	if (page_id.space() == TRX_SYS_SPACE
	    && buf_dblwr_page_inside(page_id.page_no())) {

		ib::error() << "Trying to read doublewrite buffer page "
			<< page_id;
		return(0);
	}

	if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) {

		/* Trx sys header is so low in the latching order that we play
		safe and do not leave the i/o-completion to an asynchronous
		i/o-thread. Ibuf bitmap pages must always be read with
		synchronous i/o, to make sure they do not get involved in
		thread deadlocks. */

		sync = true;
	}

	/* The following call will also check if the tablespace does not exist
	or is being dropped; if we succeed in initing the page in the buffer
	pool for read, then DISCARD cannot proceed until the read has
	completed */
	bpage = buf_page_init_for_read(err, mode, page_id, page_size, unzip);

	if (bpage == NULL) {

		return(0);
	}

	DBUG_LOG("ib_buf",
		 "read page " << page_id << " size=" << page_size.physical()
		 << " unzip=" << unzip << ',' << (sync ? "sync" : "async"));

	ut_ad(buf_page_in_file(bpage));

	if (sync) {
		thd_wait_begin(NULL, THD_WAIT_DISKIO);
	}

	void*	dst;

	if (page_size.is_compressed()) {
		dst = bpage->zip.data;
	} else {
		ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);

		dst = ((buf_block_t*) bpage)->frame;
	}

	IORequest	request(type | IORequest::READ);

	*err = fil_io(
		request, sync, page_id, page_size, 0, page_size.physical(),
		dst, bpage, ignore_missing_space);

	if (sync) {
		thd_wait_end(NULL);
	}

	if (*err != DB_SUCCESS) {
		if (*err == DB_TABLESPACE_TRUNCATED) {
			/* Remove the page which is outside the
			truncated tablespace bounds when recovering
			from a crash that happened during a truncation */
			buf_read_page_handle_error(bpage);
			if (recv_recovery_on) {
				mutex_enter(&recv_sys->mutex);
				ut_ad(recv_sys->n_addrs > 0);
				recv_sys->n_addrs--;
				mutex_exit(&recv_sys->mutex);
			}
			return(0);
		} else if (IORequest::ignore_missing(type)
			   || *err == DB_TABLESPACE_DELETED) {
			buf_read_page_handle_error(bpage);
			return(0);
		}

		ut_error;
	}

	if (sync) {
		/* The i/o is already completed when we arrive from
		fil_read */
		*err = buf_page_io_complete(bpage);

		if (*err != DB_SUCCESS) {
			return(0);
		}
	}

	return(1);
}

/** Applies a random read-ahead in buf_pool if there are at least a threshold
value of accessed pages from the random read-ahead area. Does not read any
page, not even the one at the position (space, offset), if the read-ahead
mechanism is not activated. NOTE 1: the calling thread may own latches on
pages: to avoid deadlocks this function must be written such that it cannot
end up waiting for these latches! NOTE 2: the calling thread must want
access to the page given: this rule is set to prevent unintended read-aheads
performed by ibuf routines, a situation which could result in a deadlock if
the OS does not support asynchronous i/o.
@param[in]	page_id		page id of a page which the current thread
wants to access
@param[in]	page_size	page size
@param[in]	inside_ibuf	TRUE if we are inside ibuf routine
@return number of page read requests issued; NOTE that if we read ibuf
pages, it may happen that the page at the given page number does not
get read even if we return a positive value! */
ulint
buf_read_ahead_random(
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	ibool			inside_ibuf)
{
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	ulint		recent_blocks	= 0;
	ulint		ibuf_mode;
	ulint		count;
	ulint		low, high;
	dberr_t		err = DB_SUCCESS;
	ulint		i;
	const ulint	buf_read_ahead_random_area
				= BUF_READ_AHEAD_AREA(buf_pool);

	if (!srv_random_read_ahead) {
		/* Disabled by user */
		return(0);
	}

	if (srv_startup_is_before_trx_rollback_phase) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

	if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	low  = (page_id.page_no() / buf_read_ahead_random_area)
		* buf_read_ahead_random_area;

	high = (page_id.page_no() / buf_read_ahead_random_area + 1)
		* buf_read_ahead_random_area;
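
	/* For example, with a read-ahead area of 64 pages, page number 200
	would map to the aligned area [192, 256): low = 192, high = 256. */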

	/* Remember the tablespace version before we ask the tablespace size
	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
	do not try to read outside the bounds of the tablespace! */
	if (fil_space_t* space = fil_space_acquire(page_id.space())) {

#ifdef UNIV_DEBUG
		if (srv_file_per_table) {
			ulint	size = 0;

			for (const fil_node_t*	node =
				UT_LIST_GET_FIRST(space->chain);
			     node != NULL;
			     node = UT_LIST_GET_NEXT(chain, node)) {

				size += ulint(os_file_get_size(node->handle)
					/ page_size.physical());
			}

			ut_ad(size == space->size);
		}
#endif /* UNIV_DEBUG */

		if (high > space->size) {
			high = space->size;
		}
		fil_space_release(space);
	} else {
		return(0);
	}

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	/* Count how many blocks in the area have been recently accessed,
	that is, reside near the start of the LRU list. */

	for (i = low; i < high; i++) {
		const buf_page_t*	bpage = buf_page_hash_get(
			buf_pool, page_id_t(page_id.space(), i));

		if (bpage != NULL
		    && buf_page_is_accessed(bpage)
		    && buf_page_peek_if_young(bpage)) {

			recent_blocks++;

			if (recent_blocks
			    >= BUF_READ_AHEAD_RANDOM_THRESHOLD(buf_pool)) {

				buf_pool_mutex_exit(buf_pool);
				goto read_ahead;
			}
		}
	}

	buf_pool_mutex_exit(buf_pool);
	/* Do nothing */
	return(0);

read_ahead:
	/* Read all the suitable blocks within the area */

	if (inside_ibuf) {
		ibuf_mode = BUF_READ_IBUF_PAGES_ONLY;
	} else {
		ibuf_mode = BUF_READ_ANY_PAGE;
	}

	count = 0;

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync aio
		mode: hence FALSE as the first parameter */

		const page_id_t	cur_page_id(page_id.space(), i);

		if (!ibuf_bitmap_page(cur_page_id, page_size)) {
			count += buf_read_page_low(
				&err, false,
				IORequest::DO_NOT_WAKE,
				ibuf_mode,
				cur_page_id, page_size, false);

			switch (err) {
			case DB_SUCCESS:
			case DB_TABLESPACE_TRUNCATED:
			case DB_ERROR:
				break;
			case DB_TABLESPACE_DELETED:
				ib::info() << "Random readahead trying to"
					" access page " << cur_page_id
					<< " in nonexisting or"
					" being-dropped tablespace";
				break;
			default:
				ut_error;
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

	if (count) {
		DBUG_PRINT("ib_buf", ("random read-ahead %u pages, %u:%u",
				      (unsigned) count,
				      (unsigned) page_id.space(),
				      (unsigned) page_id.page_no()));
	}

	/* Read ahead is considered one I/O operation for the purpose of
	LRU policy decision. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read_rnd += count;
	srv_stats.buf_pool_reads.add(count);
	return(count);
}

/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in]	page_id		page id
@param[in]	page_size	page size
@retval DB_SUCCESS if the page was read and is not corrupted,
@retval DB_PAGE_CORRUPTED if the page is found corrupted by the checksum check,
@retval DB_DECRYPTION_FAILED if the post-encryption checksum matches but
the normal page checksum does not match after decryption.
@retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */
dberr_t
buf_read_page(
	const page_id_t&	page_id,
	const page_size_t&	page_size)
{
	ulint		count;
	dberr_t		err = DB_SUCCESS;

	/* We do synchronous IO because our AIO completion code
	is sub-optimal. See buf_page_io_complete(): we have to
	acquire the buffer pool mutex before acquiring the block
	mutex, which is required for updating the page state. Acquiring
	the buffer pool mutex becomes an expensive bottleneck. */

	count = buf_read_page_low(
		&err, true,
		0, BUF_READ_ANY_PAGE, page_id, page_size, false);

	srv_stats.buf_pool_reads.add(count);

	if (err == DB_TABLESPACE_DELETED) {
		ib::info() << "trying to read page " << page_id
			<< " in nonexisting or being-dropped tablespace";
	}

	/* Increment number of I/O operations used for LRU policy. */
	buf_LRU_stat_inc_io();

	return(err);
}

/** High-level function which reads a page asynchronously from a file to the
buffer buf_pool if it is not already there. Sets the io_fix flag and sets
an exclusive lock on the buffer frame. The flag is cleared and the x-lock
released by the i/o-handler thread.
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	sync		true if synchronous aio is desired */
void
buf_read_page_background(
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	bool			sync)
{
	ulint		count;
	dberr_t		err;

	count = buf_read_page_low(
		&err, sync,
		IORequest::DO_NOT_WAKE | IORequest::IGNORE_MISSING,
		BUF_READ_ANY_PAGE,
		page_id, page_size, false);

	switch (err) {
	case DB_SUCCESS:
	case DB_TABLESPACE_TRUNCATED:
	case DB_ERROR:
		break;
	case DB_TABLESPACE_DELETED:
		ib::info() << "trying to read page " << page_id
			<< " in the background"
			" in a non-existing or being-dropped tablespace";
		break;
	case DB_PAGE_CORRUPTED:
	case DB_DECRYPTION_FAILED:
		ib::error()
			<< "Background Page read failed to "
			"read or decrypt " << page_id;
		break;
	default:
		ib::fatal() << "Error " << err << " in background read of "
			<< page_id;
	}

	srv_stats.buf_pool_reads.add(count);

	/* We do not increment number of I/O operations used for LRU policy
	here (buf_LRU_stat_inc_io()). We use this in heuristics to decide
	about evicting uncompressed version of compressed pages from the
	buffer pool. Since this function is called from buffer pool load,
	these IOs are deliberate and not part of the normal workload, so we
	can ignore them in our heuristics. */
}

/** Applies linear read-ahead if in the buf_pool the page is a border page of
a linear read-ahead area and all the pages in the area have been accessed.
Does not read any page if the read-ahead mechanism is not activated. Note
that the algorithm looks at the 'natural' adjacent successor and
predecessor of the page, which on the leaf level of a B-tree are the next
and previous page in the chain of leaves. To know these, the page specified
in (space, offset) must already be present in the buf_pool. Thus, the
natural way to use this function is to call it when a page in the buf_pool
is accessed the first time, calling this function just after it has been
bufferfixed.
NOTE 1: as this function looks at the natural predecessor and successor
fields on the page, what happens if these are not initialized to any
sensible value? No problem, before applying read-ahead we check that the
area to read is within the span of the space, if not, read-ahead is not
applied. An uninitialized value may result in a useless read operation, but
only very improbably.
NOTE 2: the calling thread may own latches on pages: to avoid deadlocks this
function must be written such that it cannot end up waiting for these
latches!
NOTE 3: the calling thread must want access to the page given: this rule is
set to prevent unintended read-aheads performed by ibuf routines, a situation
which could result in a deadlock if the OS does not support asynchronous io.
@param[in]	page_id		page id; see NOTE 3 above
@param[in]	page_size	page size
@param[in]	inside_ibuf	TRUE if we are inside ibuf routine
@return number of page read requests issued */
ulint
buf_read_ahead_linear(
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	ibool			inside_ibuf)
{
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	buf_page_t*	bpage;
	buf_frame_t*	frame;
	buf_page_t*	pred_bpage	= NULL;
	ulint		pred_offset;
	ulint		succ_offset;
	int		asc_or_desc;
	ulint		new_offset;
	ulint		fail_count;
	ulint		low, high;
	dberr_t		err = DB_SUCCESS;
	ulint		i;
	const ulint	buf_read_ahead_linear_area
		= BUF_READ_AHEAD_AREA(buf_pool);
	ulint		threshold;

	/* check if readahead is disabled */
	if (!srv_read_ahead_threshold) {
		return(0);
	}

	if (srv_startup_is_before_trx_rollback_phase) {
		/* No read-ahead to avoid thread deadlocks */
		return(0);
	}

	low  = (page_id.page_no() / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (page_id.page_no() / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((page_id.page_no() != low) && (page_id.page_no() != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}
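
	/* For example, in an area of 64 pages starting at page 192, only
	pages 192 and 255 are border pages that can trigger linear
	read-ahead. */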

	if (ibuf_bitmap_page(page_id, page_size) || trx_sys_hdr_page(page_id)) {

		/* If it is an ibuf bitmap page or trx sys hdr, we do
		no read-ahead, as that could break the ibuf page access
		order */

		return(0);
	}

	/* Remember the tablespace version before we ask the tablespace size
	below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we
	do not try to read outside the bounds of the tablespace! */
	ulint	space_size;

	if (fil_space_t* space = fil_space_acquire(page_id.space())) {
		space_size = space->size;
		fil_space_release(space);

		if (high > space_size) {
			/* The area is not whole */
			return(0);
		}
	} else {
		return(0);
	}

	buf_pool_mutex_enter(buf_pool);

	if (buf_pool->n_pend_reads
	    > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	/* Check that almost all pages in the area have been accessed; if
	offset == low, the accesses must be in a descending order, otherwise,
	in an ascending order. */

	asc_or_desc = 1;

	if (page_id.page_no() == low) {
		asc_or_desc = -1;
	}

	/* How many out of order accessed pages can we ignore
	when working out the access pattern for linear readahead */
	threshold = ut_min(static_cast<ulint>(64 - srv_read_ahead_threshold),
			   BUF_READ_AHEAD_AREA(buf_pool));
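
	/* For example, if srv_read_ahead_threshold is 56 and the read-ahead
	area is 64 pages, up to 64 - 56 = 8 pages may fail the access check
	before linear read-ahead is abandoned. */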

	fail_count = 0;

	for (i = low; i < high; i++) {
		bpage = buf_page_hash_get(buf_pool,
					  page_id_t(page_id.space(), i));

		if (bpage == NULL || !buf_page_is_accessed(bpage)) {
			/* Not accessed */
			fail_count++;

		} else if (pred_bpage) {
			/* Note that buf_page_is_accessed() returns
			the time of the first access.  If some blocks
			of the extent existed in the buffer pool at
			the time of a linear access pattern, the first
			access times may be nonmonotonic, even though
			the latest access times were linear.  The
			threshold (srv_read_ahead_threshold) should help
			a little against this. */
			int res = ut_ulint_cmp(
				buf_page_is_accessed(bpage),
				buf_page_is_accessed(pred_bpage));
			/* Accesses not in the right order */
			if (res != 0 && res != asc_or_desc) {
				fail_count++;
			}
		}

		if (fail_count > threshold) {
			/* Too many failures: return */
			buf_pool_mutex_exit(buf_pool);
			return(0);
		}

		if (bpage && buf_page_is_accessed(bpage)) {
			pred_bpage = bpage;
		}
	}

	/* If we got this far, we know that enough pages in the area have
	been accessed in the right order: linear read-ahead can be sensible */

	bpage = buf_page_hash_get(buf_pool, page_id);

	if (bpage == NULL) {
		buf_pool_mutex_exit(buf_pool);

		return(0);
	}

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_ZIP_PAGE:
		frame = bpage->zip.data;
		break;
	case BUF_BLOCK_FILE_PAGE:
		frame = ((buf_block_t*) bpage)->frame;
		break;
	default:
		ut_error;
		break;
	}

	/* Read the natural predecessor and successor page addresses from
	the page; NOTE that because the calling thread may have an x-latch
	on the page, we do not acquire an s-latch on the page; this is to
	prevent deadlocks. Even if we read values which are nonsense, the
	algorithm will work. */

	pred_offset = fil_page_get_prev(frame);
	succ_offset = fil_page_get_next(frame);

	buf_pool_mutex_exit(buf_pool);

	if ((page_id.page_no() == low)
	    && (succ_offset == page_id.page_no() + 1)) {

		/* This is ok, we can continue */
		new_offset = pred_offset;

	} else if ((page_id.page_no() == high - 1)
		   && (pred_offset == page_id.page_no() - 1)) {

		/* This is ok, we can continue */
		new_offset = succ_offset;
	} else {
		/* Successor or predecessor not in the right order */

		return(0);
	}

	low  = (new_offset / buf_read_ahead_linear_area)
		* buf_read_ahead_linear_area;
	high = (new_offset / buf_read_ahead_linear_area + 1)
		* buf_read_ahead_linear_area;

	if ((new_offset != low) && (new_offset != high - 1)) {
		/* This is not a border page of the area: return */

		return(0);
	}

	if (high > space_size) {
		/* The area is not whole, return */

		return(0);
	}

	ulint	count = 0;

	/* If we got this far, read-ahead can be sensible: do it */

	ulint	ibuf_mode;

	ibuf_mode = inside_ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE;

	/* Since Windows XP seems to schedule the i/o handler thread
	very eagerly, and consequently it does not wait for the
	full read batch to be posted, we use special heuristics here */

	os_aio_simulated_put_read_threads_to_sleep();

	for (i = low; i < high; i++) {
		/* It is only sensible to do read-ahead in the non-sync
		aio mode: hence FALSE as the first parameter */

		const page_id_t	cur_page_id(page_id.space(), i);

		if (!ibuf_bitmap_page(cur_page_id, page_size)) {
			count += buf_read_page_low(
				&err, false,
				IORequest::DO_NOT_WAKE,
				ibuf_mode, cur_page_id, page_size, false);

			switch (err) {
			case DB_SUCCESS:
			case DB_TABLESPACE_TRUNCATED:
			case DB_TABLESPACE_DELETED:
			case DB_ERROR:
				break;
			case DB_PAGE_CORRUPTED:
			case DB_DECRYPTION_FAILED:
				ib::error() << "linear readahead failed to"
					" read or decrypt "
					<< page_id_t(page_id.space(), i);
				break;
			default:
				ut_error;
			}
		}
	}

	/* In simulated aio we wake the aio handler threads only after
	queuing all aio requests, in native aio the following call does
	nothing: */

	os_aio_simulated_wake_handler_threads();

	if (count) {
		DBUG_PRINT("ib_buf", ("linear read-ahead " ULINTPF " pages, "
				      "%u:%u",
				      count,
				      page_id.space(),
				      page_id.page_no()));
	}

	/* Read ahead is considered one I/O operation for the purpose of
	LRU policy decision. */
	buf_LRU_stat_inc_io();

	buf_pool->stat.n_ra_pages_read += count;
	return(count);
}

/********************************************************************//**
Issues read requests for pages which the ibuf module wants to read in, in
order to contract the insert buffer tree. Technically, this function is like
a read-ahead function. */
void
buf_read_ibuf_merge_pages(
/*======================*/
	bool		sync,		/*!< in: true if the caller
					wants this function to wait
					for the highest address page
					to get read in, before this
					function returns */
	const ulint*	space_ids,	/*!< in: array of space ids */
	const ulint*	page_nos,	/*!< in: array of page numbers
					to read, with the highest page
					number the last in the
					array */
	ulint		n_stored)	/*!< in: number of elements
					in the arrays */
{
#ifdef UNIV_IBUF_DEBUG
	ut_a(n_stored < UNIV_PAGE_SIZE);
#endif

	for (ulint i = 0; i < n_stored; i++) {
		bool			found;
		const page_size_t	page_size(fil_space_get_page_size(
			space_ids[i], &found));

		if (!found) {
tablespace_deleted:
			/* The tablespace was not found: remove all
			entries for it */
			ibuf_delete_for_discarded_space(space_ids[i]);
			while (i + 1 < n_stored
			       && space_ids[i + 1] == space_ids[i]) {
				i++;
			}
			continue;
		}

		const page_id_t	page_id(space_ids[i], page_nos[i]);

		buf_pool_t*	buf_pool = buf_pool_get(page_id);

		while (buf_pool->n_pend_reads
		       > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) {
			os_thread_sleep(500000);
		}

		dberr_t	err;

		buf_read_page_low(&err,
				  sync && (i + 1 == n_stored),
				  0,
				  BUF_READ_ANY_PAGE, page_id, page_size,
				  true, true /* ignore_missing_space */);

		switch(err) {
		case DB_SUCCESS:
		case DB_TABLESPACE_TRUNCATED:
		case DB_ERROR:
			break;
		case DB_TABLESPACE_DELETED:
			goto tablespace_deleted;
		case DB_PAGE_CORRUPTED:
		case DB_DECRYPTION_FAILED:
			ib::error() << "Failed to read or decrypt " << page_id
				<< " for change buffer merge";
			break;
		default:
			ut_error;
		}
	}

	os_aio_simulated_wake_handler_threads();

	if (n_stored) {
		DBUG_PRINT("ib_buf",
			   ("ibuf merge read-ahead %u pages, space %u",
			    unsigned(n_stored), unsigned(space_ids[0])));
	}
}

/** Issues read requests for pages which recovery wants to read in.
@param[in]	sync		true if the caller wants this function to wait
for the highest address page to get read in, before this function returns
@param[in]	space_id	tablespace id
@param[in]	page_nos	array of page numbers to read, with the
highest page number the last in the array
@param[in]	n_stored	number of page numbers in the array */
void
buf_read_recv_pages(
	bool		sync,
	ulint		space_id,
	const ulint*	page_nos,
	ulint		n_stored)
{
	fil_space_t*		space	= fil_space_get(space_id);

	if (space == NULL) {
		/* The tablespace is missing: do nothing */
		return;
	}

	fil_space_open_if_needed(space);

	const page_size_t	page_size(space->flags);

	for (ulint i = 0; i < n_stored; i++) {
		buf_pool_t*		buf_pool;
		const page_id_t	cur_page_id(space_id, page_nos[i]);

		ulint			count = 0;

		buf_pool = buf_pool_get(cur_page_id);
		while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {

			os_aio_simulated_wake_handler_threads();
			os_thread_sleep(10000);
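			/* Each iteration sleeps 10000 microseconds (10 ms),
			so count / 100 in the message below is the elapsed
			wait time in seconds. */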

			count++;

			if (!(count % 1000)) {

				ib::error()
					<< "Waited for " << count / 100
					<< " seconds for "
					<< buf_pool->n_pend_reads
					<< " pending reads";
			}
		}

		dberr_t err;

		if (sync && i + 1 == n_stored) {
			buf_read_page_low(
				&err, true,
				0,
				BUF_READ_ANY_PAGE,
				cur_page_id, page_size, true);
		} else {
			buf_read_page_low(
				&err, false,
				IORequest::DO_NOT_WAKE,
				BUF_READ_ANY_PAGE,
				cur_page_id, page_size, true);
		}

		if (err == DB_DECRYPTION_FAILED || err == DB_PAGE_CORRUPTED) {
			ib::error() << "Recovery failed to read or decrypt "
				<< cur_page_id;
		}
	}

	os_aio_simulated_wake_handler_threads();

	DBUG_PRINT("ib_buf", ("recovery read-ahead (%u pages)",
			      unsigned(n_stored)));
}