/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */

#include "maria_def.h"
#ifdef HAVE_SYS_MMAN_H
#include <sys/mman.h>
#endif
#include "ma_blockrec.h"

static void maria_extra_keyflag(MARIA_HA *info,
                                enum ha_extra_function function);

/**
   @brief Set options and buffers to optimize table handling

   @param  info             open table
   @param  function         operation
   @param  extra_arg        Pointer to extra argument (normally pointer to
                            ulong); used when function is one of:
                            HA_EXTRA_WRITE_CACHE
                            HA_EXTRA_CACHE
                            HA_EXTRA_PRELOAD_BUFFER_SIZE

   @return Operation status
     @retval 0      ok
     @retval !=0    error
*/

int maria_extra(MARIA_HA *info, enum ha_extra_function function,
                void *extra_arg)
{
  int error= 0;
  ulong cache_size;
  MARIA_SHARE *share= info->s;
  my_bool block_records= share->data_file_type == BLOCK_RECORD;
  DBUG_ENTER("maria_extra");
  DBUG_PRINT("enter",("function: %d",(int) function));

  switch (function) {
  case HA_EXTRA_RESET_STATE:		/* Reset state (don't free buffers) */
    info->lastinx= 0;			/* Use first index as def */
    info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR;
    info->page_changed= 1;
					/* Next/prev gives first/last */
    if (info->opt_flag & READ_CACHE_USED)
    {
      reinit_io_cache(&info->rec_cache,READ_CACHE,0,
		      (pbool) (info->lock_type != F_UNLCK),
		      (pbool) test(info->update & HA_STATE_ROW_CHANGED)
		      );
    }
    info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND |
		   HA_STATE_PREV_FOUND);
    break;
  case HA_EXTRA_CACHE:
    if (block_records)
      break;                                    /* Not supported */

    if (info->lock_type == F_UNLCK &&
	(share->options & HA_OPTION_PACK_RECORD))
    {
      error= 1;			/* Not possible if not locked */
      my_errno= EACCES;
      break;
    }
    if (info->s->file_map) /* Don't use cache if mmap */
      break;
#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
    if ((share->options & HA_OPTION_COMPRESS_RECORD))
    {
      pthread_mutex_lock(&share->intern_lock);
      if (_ma_memmap_file(info))
      {
	/* We don't need MADV_SEQUENTIAL for small files */
	madvise((char*) share->file_map, share->state.state.data_file_length,
		share->state.state.data_file_length <= RECORD_CACHE_SIZE*16 ?
		MADV_RANDOM : MADV_SEQUENTIAL);
	pthread_mutex_unlock(&share->intern_lock);
	break;
      }
      pthread_mutex_unlock(&share->intern_lock);
    }
#endif
    if (info->opt_flag & WRITE_CACHE_USED)
    {
      info->opt_flag&= ~WRITE_CACHE_USED;
      if ((error= end_io_cache(&info->rec_cache)))
	break;
    }
    if (!(info->opt_flag &
	  (READ_CACHE_USED | WRITE_CACHE_USED | MEMMAP_USED)))
    {
      cache_size= (extra_arg ? *(ulong*) extra_arg :
		   my_default_record_cache_size);
      if (!(init_io_cache(&info->rec_cache, info->dfile.file,
			 (uint) min(share->state.state.data_file_length+1,
				    cache_size),
			  READ_CACHE,0L,(pbool) (info->lock_type != F_UNLCK),
			  MYF(share->write_flag & MY_WAIT_IF_FULL))))
      {
	info->opt_flag|= READ_CACHE_USED;
	info->update&=   ~HA_STATE_ROW_CHANGED;
      }
      if (share->non_transactional_concurrent_insert)
	info->rec_cache.end_of_file= info->state->data_file_length;
    }
    break;
  case HA_EXTRA_REINIT_CACHE:
    if (info->opt_flag & READ_CACHE_USED)
    {
      reinit_io_cache(&info->rec_cache, READ_CACHE, info->cur_row.nextpos,
		      (pbool) (info->lock_type != F_UNLCK),
		      (pbool) test(info->update & HA_STATE_ROW_CHANGED));
      info->update&= ~HA_STATE_ROW_CHANGED;
      if (share->non_transactional_concurrent_insert)
	info->rec_cache.end_of_file= info->state->data_file_length;
    }
    break;
  case HA_EXTRA_WRITE_CACHE:
    if (info->lock_type == F_UNLCK)
    {
      error= 1;                        	/* Not possible if not locked */
      break;
    }
    if (block_records)
      break;                            /* Not supported */

    cache_size= (extra_arg ? *(ulong*) extra_arg :
		 my_default_record_cache_size);
    if (!(info->opt_flag &
	  (READ_CACHE_USED | WRITE_CACHE_USED | OPT_NO_ROWS)) &&
	!share->state.header.uniques)
      if (!(init_io_cache(&info->rec_cache, info->dfile.file, cache_size,
			 WRITE_CACHE,share->state.state.data_file_length,
			  (pbool) (info->lock_type != F_UNLCK),
			  MYF(share->write_flag & MY_WAIT_IF_FULL))))
      {
	info->opt_flag|= WRITE_CACHE_USED;
	info->update&=   ~(HA_STATE_ROW_CHANGED |
                           HA_STATE_WRITE_AT_END |
                           HA_STATE_EXTEND_BLOCK);
      }
    break;
  case HA_EXTRA_PREPARE_FOR_UPDATE:
    if (info->s->data_file_type != DYNAMIC_RECORD)
      break;
    /* Remove read/write cache if dynamic rows */
    /* fall through */
  case HA_EXTRA_NO_CACHE:
    if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
    {
      info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
      error= end_io_cache(&info->rec_cache);
      /* Sergei will insert full text index caching here */
    }
#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
    if (info->opt_flag & MEMMAP_USED)
      madvise((char*) share->file_map, share->state.state.data_file_length,
              MADV_RANDOM);
#endif
    break;
  case HA_EXTRA_FLUSH_CACHE:
    if (info->opt_flag & WRITE_CACHE_USED)
    {
      if ((error= flush_io_cache(&info->rec_cache)))
      {
        maria_print_error(info->s, HA_ERR_CRASHED);
	maria_mark_crashed(info);			/* Fatal error found */
      }
    }
    break;
  case HA_EXTRA_NO_READCHECK:
    info->opt_flag&= ~READ_CHECK_USED;		/* No readcheck */
    break;
  case HA_EXTRA_READCHECK:
    info->opt_flag|= READ_CHECK_USED;
    break;
  case HA_EXTRA_KEYREAD:			/* Read only keys to record */
  case HA_EXTRA_REMEMBER_POS:
    info->opt_flag|= REMEMBER_OLD_POS;
    bmove((uchar*) info->last_key.data + share->base.max_key_length*2,
	  (uchar*) info->last_key.data,
          info->last_key.data_length + info->last_key.ref_length);
    info->save_update=	info->update;
    info->save_lastinx= info->lastinx;
    info->save_lastpos= info->cur_row.lastpos;
    info->save_lastkey_data_length= info->last_key.data_length;
    info->save_lastkey_ref_length= info->last_key.ref_length;
    if (function == HA_EXTRA_REMEMBER_POS)
      break;
    /* fall through */
  case HA_EXTRA_KEYREAD_CHANGE_POS:
    info->opt_flag|= KEY_READ_USED;
    info->read_record= _ma_read_key_record;
    break;
  case HA_EXTRA_NO_KEYREAD:
  case HA_EXTRA_RESTORE_POS:
    if (info->opt_flag & REMEMBER_OLD_POS)
    {
      bmove((uchar*) info->last_key.data,
	    (uchar*) info->last_key.data + share->base.max_key_length*2,
	    info->save_lastkey_data_length + info->save_lastkey_ref_length);
      info->update=	info->save_update | HA_STATE_WRITTEN;
      info->lastinx=	info->save_lastinx;
      info->cur_row.lastpos= info->save_lastpos;
      info->last_key.data_length= info->save_lastkey_data_length;
      info->last_key.ref_length= info->save_lastkey_ref_length;
      info->last_key.flag= 0;
    }
    info->read_record=	share->read_record;
    info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS);
    break;
  case HA_EXTRA_NO_USER_CHANGE: /* Database is somehow locked against changes */
    info->lock_type= F_EXTRA_LCK; /* Simulate as locked */
    break;
  case HA_EXTRA_WAIT_LOCK:
    info->lock_wait= 0;                 /* Block until the lock is granted */
    break;
  case HA_EXTRA_NO_WAIT_LOCK:
    info->lock_wait= MY_SHORT_WAIT;     /* Give up lock requests after a short wait */
    break;
  case HA_EXTRA_NO_KEYS:
    /* we're going to modify pieces of the state, stall Checkpoint */
    pthread_mutex_lock(&share->intern_lock);
    if (info->lock_type == F_UNLCK)
    {
      pthread_mutex_unlock(&share->intern_lock);
      error= 1;					/* Not possible if not locked */
      break;
    }
    if (maria_is_any_key_active(share->state.key_map))
    {
      MARIA_KEYDEF *key= share->keyinfo;
      uint i;
      for (i =0 ; i < share->base.keys ; i++,key++)
      {
        if (!(key->flag & HA_NOSAME) && info->s->base.auto_key != i+1)
        {
          maria_clear_key_active(share->state.key_map, i);
          info->update|= HA_STATE_CHANGED;
        }
      }

      if (!share->changed)
      {
	share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED;
	share->changed= 1;			/* Update on close */
	if (!share->global_changed)
	{
	  share->global_changed= 1;
	  share->state.open_count++;
	}
      }
      if (!share->now_transactional)
        share->state.state= *info->state;
      /*
        This state write to disk must be done, even for transactional tables;
        indeed the table's share is going to be lost (there was a
        HA_EXTRA_FORCE_REOPEN before, which set share->last_version to 0),
        and so the only way it can leave information (share->state.key_map)
        for posterity is by writing it to disk.
      */
      DBUG_ASSERT(!maria_in_recovery);
      error= _ma_state_info_write(share, 1|2);
    }
    pthread_mutex_unlock(&share->intern_lock);
    break;
  case HA_EXTRA_FORCE_REOPEN:
    /*
      MySQL uses this case after it has closed all other instances
      of this table.
      We however do a flush here for additional safety.
    */
    /** @todo consider porting these flushes to MyISAM */
    DBUG_ASSERT(share->reopen == 1);
    error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                                 FLUSH_FORCE_WRITE, FLUSH_FORCE_WRITE);
    if (!error && share->changed)
    {
      pthread_mutex_lock(&share->intern_lock);
      if (!(error= _ma_state_info_write(share, 1|2)))
        share->changed= 0;
      pthread_mutex_unlock(&share->intern_lock);
    }
    pthread_mutex_lock(&THR_LOCK_maria);
    pthread_mutex_lock(&share->intern_lock); /* protect against Checkpoint */
    /* This makes sure the share is not re-used next time the table is opened */
    share->last_version= 0L;			/* Impossible version */
    pthread_mutex_unlock(&share->intern_lock);
    pthread_mutex_unlock(&THR_LOCK_maria);
    break;
  case HA_EXTRA_PREPARE_FOR_DROP:
  case HA_EXTRA_PREPARE_FOR_RENAME:
  {
    my_bool do_flush= test(function != HA_EXTRA_PREPARE_FOR_DROP);
    enum flush_type type;
    pthread_mutex_lock(&THR_LOCK_maria);
    /*
      As this share is going to get last_version= 0, it needs to save all
      its data/index blocks to disk if this is not for a DROP TABLE.
      Otherwise the blocks would be invisible to future openers of the
      table; worse, they could be written to disk late and overwrite the
      work of those future openers.
    */
    if (info->lock_type != F_UNLCK && !info->was_locked)
    {
      info->was_locked= info->lock_type;
      if (maria_lock_database(info, F_UNLCK))
        error= my_errno;
      info->lock_type= F_UNLCK;
    }
    if (share->kfile.file >= 0)
      _ma_decrement_open_count(info);
    pthread_mutex_lock(&share->intern_lock);
    if (info->trn)
    {
      _ma_remove_table_from_trnman(share, info->trn);
      /* Ensure we don't point to the deleted data in trn */
      info->state= &share->state.state;
    }

    type= do_flush ? FLUSH_RELEASE : FLUSH_IGNORE_CHANGED;
    if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                              type, type))
    {
      error=my_errno;
      share->changed= 1;
    }
    if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
    {
      info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
      if (end_io_cache(&info->rec_cache))
        error= 1;
    }
    if (share->kfile.file >= 0)
    {
      if (do_flush)
      {
        /* Save the state so that others can find it from disk. */
        if (_ma_state_info_write(share, 1 | 2) ||
            my_sync(share->kfile.file, MYF(0)))
          error= my_errno;
        else
          share->changed= 0;
      }
      else
      {
        /* Make sure the state is not written later, as the file may be closed */
        share->changed= 0;
      }
    }
    if (share->data_file_type == BLOCK_RECORD &&
        share->bitmap.file.file >= 0)
    {
      if (do_flush && my_sync(share->bitmap.file.file, MYF(0)))
        error= my_errno;
    }
    /* For protection against Checkpoint, this is set under intern_lock: */
    share->last_version= 0L;			/* Impossible version */
    pthread_mutex_unlock(&share->intern_lock);
    pthread_mutex_unlock(&THR_LOCK_maria);
    break;
  }
  case HA_EXTRA_FLUSH:
    if (!share->temporary)
      error= _ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                                   FLUSH_KEEP, FLUSH_KEEP);
#ifdef HAVE_PWRITE
    _ma_decrement_open_count(info);
#endif
    if (share->not_flushed)
    {
      share->not_flushed= 0;
      if (_ma_sync_table_files(info))
	error= my_errno;
      if (error)
      {
	share->changed= 1;
        maria_print_error(info->s, HA_ERR_CRASHED);
	maria_mark_crashed(info);			/* Fatal error found */
      }
    }
    break;
  case HA_EXTRA_NORMAL:				/* These aren't in use */
    info->quick_mode= 0;
    break;
  case HA_EXTRA_QUICK:
    info->quick_mode= 1;
    break;
  case HA_EXTRA_NO_ROWS:
    if (!share->state.header.uniques)
      info->opt_flag|= OPT_NO_ROWS;
    break;
  case HA_EXTRA_PRELOAD_BUFFER_SIZE:
    info->preload_buff_size= *((ulong *) extra_arg);
    break;
  case HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
  case HA_EXTRA_CHANGE_KEY_TO_DUP:
    maria_extra_keyflag(info, function);
    break;
  case HA_EXTRA_MMAP:
#ifdef HAVE_MMAP
    if (block_records)
      break;                                    /* Not supported */
    pthread_mutex_lock(&share->intern_lock);
    /*
      Memory map the data file if it is not already mapped. It is safe
      to memory map a file while other threads are using file I/O on it.
      Assigning a new address to a function pointer is an atomic
      operation. intern_lock prevents two or more mappings from being
      done at the same time.
    */
    if (!share->file_map)
    {
      if (_ma_dynmap_file(info, share->state.state.data_file_length))
      {
        DBUG_PRINT("warning",("mmap failed: errno: %d",errno));
        error= my_errno= errno;
      }
      else
      {
        share->file_read=  _ma_mmap_pread;
        share->file_write= _ma_mmap_pwrite;
      }
    }
    pthread_mutex_unlock(&share->intern_lock);
#endif
    break;
  case HA_EXTRA_MARK_AS_LOG_TABLE:
    pthread_mutex_lock(&share->intern_lock);
    share->is_log_table= TRUE;
    pthread_mutex_unlock(&share->intern_lock);
    break;
  case HA_EXTRA_KEY_CACHE:
  case HA_EXTRA_NO_KEY_CACHE:
  default:
    break;
  }
  DBUG_RETURN(error);
} /* maria_extra */
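

/*
  Informal usage sketch (not compiled): a typical caller, such as the
  handler layer, enables a write cache of a given size for bulk writes and
  later drops it again. 'info' and 'size' are hypothetical locals; the
  table is assumed to be locked, as HA_EXTRA_WRITE_CACHE requires:

    ulong size= 1024*1024;                            // record cache size
    maria_extra(info, HA_EXTRA_WRITE_CACHE, &size);   // enable write cache
    ...                                               // writes go via rec_cache
    maria_extra(info, HA_EXTRA_NO_CACHE, 0);          // flush and drop caches
*/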


/*
  Start/Stop Inserting Duplicates Into a Table, WL#1648.
  This sets or clears the HA_NOSAME flag on every key of the table.
*/

static void maria_extra_keyflag(MARIA_HA *info,
                                enum ha_extra_function function)
{
  uint  idx;

  for (idx= 0; idx< info->s->base.keys; idx++)
  {
    switch (function) {
    case HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
      info->s->keyinfo[idx].flag|= HA_NOSAME;
      break;
    case HA_EXTRA_CHANGE_KEY_TO_DUP:
      info->s->keyinfo[idx].flag&= ~(HA_NOSAME);
      break;
    default:
      break;
    }
  }
}
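

/*
  Informal usage sketch (not compiled): maria_extra_keyflag() is static and
  only reached through maria_extra(). For example, a caller could drop the
  uniqueness requirement before a bulk operation and restore it afterwards
  ('info' is a hypothetical open handle):

    maria_extra(info, HA_EXTRA_CHANGE_KEY_TO_DUP, 0);      // clear HA_NOSAME
    ...                                                     // bulk work
    maria_extra(info, HA_EXTRA_CHANGE_KEY_TO_UNIQUE, 0);   // restore HA_NOSAME
*/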


int maria_reset(MARIA_HA *info)
{
  int error= 0;
  MARIA_SHARE *share= info->s;
  DBUG_ENTER("maria_reset");
  /*
    Free buffers and reset the following flags:
    EXTRA_CACHE, EXTRA_WRITE_CACHE, EXTRA_KEYREAD, EXTRA_QUICK

    If the row buffer cache is large (for dynamic tables), reduce it
    to save memory.
  */
  if (info->opt_flag & (READ_CACHE_USED | WRITE_CACHE_USED))
  {
    info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
    error= end_io_cache(&info->rec_cache);
  }
  /* Free memory used for keeping blobs */
  if (share->base.blobs)
  {
    if (info->rec_buff_size > share->base.default_rec_buff_size)
    {
      info->rec_buff_size= 1;                 /* Force realloc */
      _ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size,
                       share->base.default_rec_buff_size);
    }
    if (info->blob_buff_size > MARIA_SMALL_BLOB_BUFFER)
    {
      info->blob_buff_size= 1;                 /* Force realloc */
      _ma_alloc_buffer(&info->blob_buff, &info->blob_buff_size,
                       MARIA_SMALL_BLOB_BUFFER);
    }
  }
#if defined(HAVE_MMAP) && defined(HAVE_MADVISE)
  if (info->opt_flag & MEMMAP_USED)
    madvise((char*) share->file_map, share->state.state.data_file_length,
            MADV_RANDOM);
#endif
  info->opt_flag&= ~(KEY_READ_USED | REMEMBER_OLD_POS);
  info->quick_mode= 0;
  info->lastinx= 0;			/* Use first index as def */
  info->last_search_keypage= info->cur_row.lastpos= HA_OFFSET_ERROR;
  info->page_changed= 1;
  info->update= ((info->update & HA_STATE_CHANGED) | HA_STATE_NEXT_FOUND |
                 HA_STATE_PREV_FOUND);
  DBUG_RETURN(error);
}


int _ma_sync_table_files(const MARIA_HA *info)
{
  return (my_sync(info->dfile.file, MYF(MY_WME)) ||
          my_sync(info->s->kfile.file, MYF(MY_WME)));
}


/**
   @brief Flushes the data and/or index file of a table

   This is useful when one wants to read a table using OS syscalls (like
   my_copy()) and first wants to be sure that MySQL-level caches go down to
   the OS so that OS syscalls can see all data. It can flush rec_cache,
   bitmap, pagecache of data file, pagecache of index file.

   @param  info                 table
   @param  flush_data_or_index  one or both of these flags:
                                MARIA_FLUSH_DATA, MARIA_FLUSH_INDEX
   @param  flush_type_for_data  flush type to use for the data file
   @param  flush_type_for_index flush type to use for the index file

   @note Does not sync files (@see _ma_sync_table_files()).
   @note Over time this function will be used in all places where we flush
   the index but not the data file (probable bugs).

   @return Operation status
     @retval 0      OK
     @retval 1      Error
*/

int _ma_flush_table_files(MARIA_HA *info, uint flush_data_or_index,
                          enum flush_type flush_type_for_data,
                          enum flush_type flush_type_for_index)
{
  int error= 0;
  MARIA_SHARE *share= info->s;
  /* flush data file first because it's more critical */
  if (flush_data_or_index & MARIA_FLUSH_DATA)
  {
    if ((info->opt_flag & WRITE_CACHE_USED) &&
        flush_type_for_data != FLUSH_IGNORE_CHANGED &&
        flush_io_cache(&info->rec_cache))
      error= 1;
    if (share->data_file_type == BLOCK_RECORD)
    {
      if (flush_type_for_data != FLUSH_IGNORE_CHANGED)
      {
        if (_ma_bitmap_flush(share))
          error= 1;
      }
      else
        info->s->bitmap.changed= 0;
      if (flush_pagecache_blocks(share->pagecache, &info->dfile,
                                 flush_type_for_data))
        error= 1;
    }
  }
  if ((flush_data_or_index & MARIA_FLUSH_INDEX) &&
      flush_pagecache_blocks(share->pagecache, &share->kfile,
                             flush_type_for_index))
    error= 1;
  if (!error)
    return 0;

  maria_print_error(info->s, HA_ERR_CRASHED);
  maria_mark_crashed(info);
  return 1;
}
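

/*
  Informal usage sketch (not compiled): the HA_EXTRA_FLUSH case above uses
  this function roughly as follows - flush both files while keeping their
  blocks in the page cache, and sync separately since this function does
  not sync ('info' and 'error' are hypothetical locals):

    int error= 0;
    if (_ma_flush_table_files(info, MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX,
                              FLUSH_KEEP, FLUSH_KEEP))
      error= 1;
    else if (_ma_sync_table_files(info))
      error= my_errno;
*/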