// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Malloc profiling.
// Patterned after tcmalloc's algorithms; shorter code.

package runtime
#include "runtime.h"
#include "arch_GOARCH.h"
#include "malloc.h"
#include "defs_GOOS_GOARCH.h"
#include "type.h"

// NOTE(rsc): Everything here could use cas if contention became an issue.
static Lock proflock;

// All memory allocations are local and do not escape outside of the profiler.
// The profiler is forbidden from referring to garbage-collected memory.

enum { MProf, BProf };  // profile types

// Per-call-stack profiling information.
// Lookup by hashing call stack into a linked-list hash table.
typedef struct Bucket Bucket;
struct Bucket
{
	Bucket	*next;	// next in hash list
	Bucket	*allnext;	// next in list of all mbuckets/bbuckets
	int32	typ;
	// Generally unions can break precise GC,
	// this one is fine because it does not contain pointers.
	union
	{
		struct  // typ == MProf
		{
			uintptr	allocs;
			uintptr	frees;
			uintptr	alloc_bytes;
			uintptr	free_bytes;
			uintptr	recent_allocs;  // since last gc
			uintptr	recent_frees;
			uintptr	recent_alloc_bytes;
			uintptr	recent_free_bytes;
		};
		struct  // typ == BProf
		{
			int64	count;
			int64	cycles;
		};
	};
	uintptr	hash;
	uintptr	nstk;
	uintptr	stk[1];
};
enum {
	BuckHashSize = 179999,
};
static Bucket **buckhash;
static Bucket *mbuckets;  // memory profile buckets
static Bucket *bbuckets;  // blocking profile buckets
static uintptr bucketmem;

// Return the bucket for stk[0:nstk], allocating new bucket if needed.
static Bucket*
stkbucket(int32 typ, uintptr *stk, int32 nstk, bool alloc)
{
	int32 i;
	uintptr h;
	Bucket *b;

	if(buckhash == nil) {
		buckhash = runtime·SysAlloc(BuckHashSize*sizeof buckhash[0]);
		if(buckhash == nil)
			runtime·throw("runtime: cannot allocate memory");
		mstats.buckhash_sys += BuckHashSize*sizeof buckhash[0];
	}

	// Hash stack.
	h = 0;
	for(i=0; i<nstk; i++) {
		h += stk[i];
		h += h<<10;
		h ^= h>>6;
	}
	h += h<<3;
	h ^= h>>11;

	i = h%BuckHashSize;
	for(b = buckhash[i]; b; b=b->next)
		if(b->typ == typ && b->hash == h && b->nstk == nstk &&
		   runtime·mcmp((byte*)b->stk, (byte*)stk, nstk*sizeof stk[0]) == 0)
			return b;

	if(!alloc)
		return nil;

	b = runtime·persistentalloc(sizeof *b + nstk*sizeof stk[0], 0);
	bucketmem += sizeof *b + nstk*sizeof stk[0];
	runtime·memmove(b->stk, stk, nstk*sizeof stk[0]);
	b->typ = typ;
	b->hash = h;
	b->nstk = nstk;
	b->next = buckhash[i];
	buckhash[i] = b;
	if(typ == MProf) {
		b->allnext = mbuckets;
		mbuckets = b;
	} else {
		b->allnext = bbuckets;
		bbuckets = b;
	}
	return b;
}

static void
MProf_GC(void)
{
	Bucket *b;

	for(b=mbuckets; b; b=b->allnext) {
		b->allocs += b->recent_allocs;
		b->frees += b->recent_frees;
		b->alloc_bytes += b->recent_alloc_bytes;
		b->free_bytes += b->recent_free_bytes;
		b->recent_allocs = 0;
		b->recent_frees = 0;
		b->recent_alloc_bytes = 0;
		b->recent_free_bytes = 0;
	}
}

// Record that a gc just happened: all the 'recent' statistics are now real.
void
runtime·MProf_GC(void)
{
	runtime·lock(&proflock);
	MProf_GC();
	runtime·unlock(&proflock);
}

// Map from pointer to Bucket* that allocated it.
// Three levels:
//	Linked-list hash table for top N-AddrHashShift bits.
//	Array index for next AddrDenseBits bits.
//	Linked list for next AddrHashShift-AddrDenseBits bits.
// This is more efficient than using a general map,
// because of the typical clustering of the pointer keys.

typedef struct AddrHash AddrHash;
typedef struct AddrEntry AddrEntry;

enum {
	AddrHashBits = 12,	// good for 4GB of used address space
	AddrHashShift = 20,	// each AddrHash knows about 1MB of address space
	AddrDenseBits = 8,	// good for a profiling rate of 4096 bytes
};

struct AddrHash
{
	AddrHash *next;	// next in top-level hash table linked list
	uintptr addr;	// addr>>20
	AddrEntry *dense[1<<AddrDenseBits];
};

struct AddrEntry
{
	AddrEntry *next;	// next in bottom-level linked list
	uint32 addr;
	Bucket *b;
};

static AddrHash **addrhash;	// points to (AddrHash*)[1<<AddrHashBits]
static AddrEntry *addrfree;
static uintptr addrmem;

// Multiplicative hash function:
// hashMultiplier is the bottom 32 bits of int((sqrt(5)-1)/2 * (1<<32)).
// This is a good multiplier as suggested in CLR, Knuth.  The hash
// value is taken to be the top AddrHashBits bits of the bottom 32 bits
// of the multiplied value.
enum {
	HashMultiplier = 2654435769U
};

// Set the bucket associated with addr to b.
static void
setaddrbucket(uintptr addr, Bucket *b)
{
	int32 i;
	uint32 h;
	AddrHash *ah;
	AddrEntry *e;

	h = (uint32)((addr>>AddrHashShift)*HashMultiplier) >> (32-AddrHashBits);
	for(ah=addrhash[h]; ah; ah=ah->next)
		if(ah->addr == (addr>>AddrHashShift))
			goto found;

	ah = runtime·persistentalloc(sizeof *ah, 0);
	addrmem += sizeof *ah;
	ah->next = addrhash[h];
	ah->addr = addr>>AddrHashShift;
	addrhash[h] = ah;

found:
	if((e = addrfree) == nil) {
		e = runtime·persistentalloc(64*sizeof *e, 0);
		addrmem += 64*sizeof *e;
		for(i=0; i+1<64; i++)
			e[i].next = &e[i+1];
		e[63].next = nil;
	}
	addrfree = e->next;
	e->addr = (uint32)~(addr & ((1<<AddrHashShift)-1));
	e->b = b;
	h = (addr>>(AddrHashShift-AddrDenseBits))&(nelem(ah->dense)-1);	// entry in dense is top 8 bits of low 20.
	e->next = ah->dense[h];
	ah->dense[h] = e;
}

// Get the bucket associated with addr and clear the association.
static Bucket*
getaddrbucket(uintptr addr)
{
	uint32 h;
	AddrHash *ah;
	AddrEntry *e, **l;
	Bucket *b;

	h = (uint32)((addr>>AddrHashShift)*HashMultiplier) >> (32-AddrHashBits);
	for(ah=addrhash[h]; ah; ah=ah->next)
		if(ah->addr == (addr>>AddrHashShift))
			goto found;
	return nil;

found:
	h = (addr>>(AddrHashShift-AddrDenseBits))&(nelem(ah->dense)-1);	// entry in dense is top 8 bits of low 20.
	for(l=&ah->dense[h]; (e=*l) != nil; l=&e->next) {
		if(e->addr == (uint32)~(addr & ((1<<AddrHashShift)-1))) {
			*l = e->next;
			b = e->b;
			e->next = addrfree;
			addrfree = e;
			return b;
		}
	}
	return nil;
}

// Called by malloc to record a profiled block.
void
runtime·MProf_Malloc(void *p, uintptr size)
{
	int32 nstk;
	uintptr stk[32];
	Bucket *b;

	if(m->nomemprof > 0)
		return;

	m->nomemprof++;
	nstk = runtime·callers(1, stk, 32);
	runtime·lock(&proflock);
	b = stkbucket(MProf, stk, nstk, true);
	b->recent_allocs++;
	b->recent_alloc_bytes += size;
	setaddrbucket((uintptr)p, b);
	runtime·unlock(&proflock);
	m->nomemprof--;
}

// Called when freeing a profiled block.
void
runtime·MProf_Free(void *p, uintptr size)
{
	Bucket *b;

	if(m->nomemprof > 0)
		return;

	m->nomemprof++;
	runtime·lock(&proflock);
	b = getaddrbucket((uintptr)p);
	if(b != nil) {
		b->recent_frees++;
		b->recent_free_bytes += size;
	}
	runtime·unlock(&proflock);
	m->nomemprof--;
}

int64 runtime·blockprofilerate;  // in CPU ticks

void
runtime·SetBlockProfileRate(intgo rate)
{
	runtime·atomicstore64((uint64*)&runtime·blockprofilerate, rate * runtime·tickspersecond() / (1000*1000*1000));
}

void
runtime·blockevent(int64 cycles, int32 skip)
{
	int32 nstk;
	int64 rate;
	uintptr stk[32];
	Bucket *b;

	if(cycles <= 0)
		return;
	rate = runtime·atomicload64((uint64*)&runtime·blockprofilerate);
	if(rate <= 0 || (rate > cycles && runtime·fastrand1()%rate > cycles))
		return;

	nstk = runtime·callers(skip, stk, 32);
	runtime·lock(&proflock);
	b = stkbucket(BProf, stk, nstk, true);
	b->count++;
	b->cycles += cycles;
	runtime·unlock(&proflock);
}

// Go interface to profile data.  (Declared in debug.go)

// Must match MemProfileRecord in debug.go.
typedef struct Record Record;
struct Record {
	int64 alloc_bytes, free_bytes;
	int64 alloc_objects, free_objects;
	uintptr stk[32];
};

// Write b's data to r.
static void
record(Record *r, Bucket *b)
{
	int32 i;

	r->alloc_bytes = b->alloc_bytes;
	r->free_bytes = b->free_bytes;
	r->alloc_objects = b->allocs;
	r->free_objects = b->frees;
	for(i=0; i<b->nstk && i<nelem(r->stk); i++)
		r->stk[i] = b->stk[i];
	for(; i<nelem(r->stk); i++)
		r->stk[i] = 0;
}

func MemProfile(p Slice, include_inuse_zero bool) (n int, ok bool) {
	Bucket *b;
	Record *r;
	bool clear;

	runtime·lock(&proflock);
	n = 0;
	clear = true;
	for(b=mbuckets; b; b=b->allnext) {
		if(include_inuse_zero || b->alloc_bytes != b->free_bytes)
			n++;
		if(b->allocs != 0 || b->frees != 0)
			clear = false;
	}
	if(clear) {
		// Absolutely no data, suggesting that a garbage collection
		// has not yet happened. In order to allow profiling when
		// garbage collection is disabled from the beginning of execution,
		// accumulate stats as if a GC just happened, and recount buckets.
		MProf_GC();
		n = 0;
		for(b=mbuckets; b; b=b->allnext)
			if(include_inuse_zero || b->alloc_bytes != b->free_bytes)
				n++;
	}
	ok = false;
	if(n <= p.len) {
		ok = true;
		r = (Record*)p.array;
		for(b=mbuckets; b; b=b->allnext)
			if(include_inuse_zero || b->alloc_bytes != b->free_bytes)
				record(r++, b);
	}
	runtime·unlock(&proflock);
}

// Must match BlockProfileRecord in debug.go.
typedef struct BRecord BRecord;
struct BRecord {
	int64 count;
	int64 cycles;
	uintptr stk[32];
};

func BlockProfile(p Slice) (n int, ok bool) {
	Bucket *b;
	BRecord *r;
	int32 i;

	runtime·lock(&proflock);
	n = 0;
	for(b=bbuckets; b; b=b->allnext)
		n++;
	ok = false;
	if(n <= p.len) {
		ok = true;
		r = (BRecord*)p.array;
		for(b=bbuckets; b; b=b->allnext, r++) {
			r->count = b->count;
			r->cycles = b->cycles;
			for(i=0; i<b->nstk && i<nelem(r->stk); i++)
				r->stk[i] = b->stk[i];
			for(; i<nelem(r->stk); i++)
				r->stk[i] = 0;			
		}
	}
	runtime·unlock(&proflock);
}

// Must match StackRecord in debug.go.
typedef struct TRecord TRecord;
struct TRecord {
	uintptr stk[32];
};

func ThreadCreateProfile(p Slice) (n int, ok bool) {
	TRecord *r;
	M *first, *mp;
	
	first = runtime·atomicloadp(&runtime·allm);
	n = 0;
	for(mp=first; mp; mp=mp->alllink)
		n++;
	ok = false;
	if(n <= p.len) {
		ok = true;
		r = (TRecord*)p.array;
		for(mp=first; mp; mp=mp->alllink) {
			runtime·memmove(r->stk, mp->createstack, sizeof r->stk);
			r++;
		}
	}
}

func Stack(b Slice, all bool) (n int) {
	uintptr pc, sp;
	
	sp = runtime·getcallersp(&b);
	pc = (uintptr)runtime·getcallerpc(&b);

	if(all) {
		runtime·semacquire(&runtime·worldsema);
		m->gcing = 1;
		runtime·stoptheworld();
	}

	if(b.len == 0)
		n = 0;
	else{
		g->writebuf = (byte*)b.array;
		g->writenbuf = b.len;
		runtime·goroutineheader(g);
		runtime·traceback(pc, sp, 0, g);
		if(all)
			runtime·tracebackothers(g);
		n = b.len - g->writenbuf;
		g->writebuf = nil;
		g->writenbuf = 0;
	}
	
	if(all) {
		m->gcing = 0;
		runtime·semrelease(&runtime·worldsema);
		runtime·starttheworld();
	}
}

static void
saveg(uintptr pc, uintptr sp, G *gp, TRecord *r)
{
	int32 n;
	
	n = runtime·gentraceback((uintptr)pc, (uintptr)sp, 0, gp, 0, r->stk, nelem(r->stk), nil, nil);
	if(n < nelem(r->stk))
		r->stk[n] = 0;
}

func GoroutineProfile(b Slice) (n int, ok bool) {
	uintptr pc, sp;
	TRecord *r;
	G *gp;
	
	sp = runtime·getcallersp(&b);
	pc = (uintptr)runtime·getcallerpc(&b);
	
	ok = false;
	n = runtime·gcount();
	if(n <= b.len) {
		runtime·semacquire(&runtime·worldsema);
		m->gcing = 1;
		runtime·stoptheworld();

		n = runtime·gcount();
		if(n <= b.len) {
			ok = true;
			r = (TRecord*)b.array;
			saveg(pc, sp, g, r++);
			for(gp = runtime·allg; gp != nil; gp = gp->alllink) {
				if(gp == g || gp->status == Gdead)
					continue;
				saveg(gp->sched.pc, gp->sched.sp, gp, r++);
			}
		}
	
		m->gcing = 0;
		runtime·semrelease(&runtime·worldsema);
		runtime·starttheworld();
	}
}

void
runtime·mprofinit(void)
{
	addrhash = runtime·persistentalloc((1<<AddrHashBits)*sizeof *addrhash, 0);
}