/* frame_malloc.c                  -*-C-*-
 *
 *************************************************************************
 *
 *  @copyright
 *  Copyright (C) 2009-2013, Intel Corporation
 *  All rights reserved.
 *
 *  @copyright
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  @copyright
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 **************************************************************************/

#include "frame_malloc.h"
#include "bug.h"
#include "local_state.h"
#include "cilk_malloc.h"

#ifndef __VXWORKS__
#include <malloc.h>
#endif

/* #define USE_MMAP 1 */
#if USE_MMAP
#define __USE_MISC 1
#include <sys/mman.h>
#include <errno.h>
#include <assert.h>
#endif

// Define to fill the stack frame header with the fill character when pushing
// it on a free list.  Note that this should be #ifdef'd out when checked in!
#ifdef _DEBUG
#define HEADER_FILL_CHAR 0xbf
#endif

// HEADER_FILL_CHAR should not be defined when checked in, so put out a warning
// message if this is a release build
#if defined(NDEBUG) && defined (HEADER_FILL_CHAR)
#pragma message ("Warning: HEADER_FILL_CHAR defined for a release build")
#endif

static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size);

#ifndef _WIN32

const unsigned short __cilkrts_bucket_sizes[FRAME_MALLOC_NBUCKETS] =
{
    64, 128, 256, 512, 1024, 2048
};

#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) __cilkrts_bucket_sizes[bucket]

/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE 2048

#else // _WIN32

/* Note that this must match the implementation of framesz_to_bucket in
 * asmilator/layout.ml! */
#define FRAME_MALLOC_BUCKET_TO_SIZE(bucket) ((size_t)(64 << (bucket)))

/* threshold above which we use slow malloc */
#define FRAME_MALLOC_MAX_SIZE \
    FRAME_MALLOC_BUCKET_TO_SIZE(FRAME_MALLOC_NBUCKETS - 1)

#endif // _WIN32
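/* Size-class note (added for reference): on both paths above, bucket i serves
   frames of 64 << i bytes, i.e. 64, 128, 256, 512, 1024 and 2048 bytes,
   assuming FRAME_MALLOC_NBUCKETS is 6 as the table above implies.  A 100-byte
   request, for example, is served from the 128-byte bucket, while requests
   larger than FRAME_MALLOC_MAX_SIZE bypass the bucket allocator entirely
   (see __cilkrts_frame_malloc below). */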
/* utility procedures */
static void push(struct free_list **b, struct free_list *p)
{
#ifdef HEADER_FILL_CHAR
    memset(p, HEADER_FILL_CHAR, FRAME_MALLOC_BUCKET_TO_SIZE(0));
#endif
    /* cons! onto free list */
    p->cdr = *b;
    *b = p;
}

static struct free_list *pop(struct free_list **b)
{
    struct free_list *p = *b;
    if (p)
        *b = p->cdr;
    return p;
}

/*************************************************************
  global allocator:
*************************************************************/
/* request slightly less than 2^K from the OS, which after malloc
   overhead and alignment should end up filling each VM page almost
   completely.  128 is a guess of the total malloc overhead and cache
   line alignment */
#define FRAME_MALLOC_CHUNK (32 * 1024 - 128)

/** Implements linked list of frames */
struct pool_cons {
    char *p;                /**< This element of the list */
    struct pool_cons *cdr;  /**< Remainder of the list */
};

static void extend_global_pool(global_state_t *g)
{
    /* FIXME: memalign to a cache line? */
    struct pool_cons *c = (struct pool_cons *)__cilkrts_malloc(sizeof(*c));
    g->frame_malloc.pool_begin =
        (char *)__cilkrts_malloc((size_t)FRAME_MALLOC_CHUNK);
    g->frame_malloc.pool_end =
        g->frame_malloc.pool_begin + FRAME_MALLOC_CHUNK;
    g->frame_malloc.allocated_from_os += FRAME_MALLOC_CHUNK;
    c->p = g->frame_malloc.pool_begin;
    c->cdr = g->frame_malloc.pool_list;
    g->frame_malloc.pool_list = c;
}
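/* Worked example of the chunk arithmetic (added for reference):
   FRAME_MALLOC_CHUNK is 32 * 1024 - 128 = 32640 bytes.  If a chunk were
   consumed entirely by one bucket, it would yield 510 frames of 64 bytes or
   255 frames of 128 bytes with nothing left over; for 256-byte frames it
   would yield 127 frames, and the final 128 bytes would be counted as
   wasted by global_alloc() below when the pool is next extended. */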
/* the size is already canonicalized at this point */
static struct free_list *global_alloc(global_state_t *g, int bucket)
{
    struct free_list *mem;
    size_t size;

    CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool += size;

    if (!(mem = pop(&g->frame_malloc.global_free_list[bucket]))) {
        CILK_ASSERT(g->frame_malloc.pool_begin <= g->frame_malloc.pool_end);
        if (g->frame_malloc.pool_begin + size > g->frame_malloc.pool_end) {
            /* We waste the fragment of pool. */
            g->frame_malloc.wasted += g->frame_malloc.pool_end -
                g->frame_malloc.pool_begin;
            extend_global_pool(g);
        }
        mem = (struct free_list *)g->frame_malloc.pool_begin;
        g->frame_malloc.pool_begin += size;
    }

    return mem;
}

static void global_free(global_state_t *g, void *mem, int bucket)
{
    size_t size;

    CILK_ASSERT(bucket < FRAME_MALLOC_NBUCKETS);
    size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    g->frame_malloc.allocated_from_global_pool -= size;

    push(&g->frame_malloc.global_free_list[bucket], mem);
}

void __cilkrts_frame_malloc_global_init(global_state_t *g)
{
    int i;

    __cilkrts_mutex_init(&g->frame_malloc.lock);
    g->frame_malloc.check_for_leaks = 1;
    g->frame_malloc.pool_list = 0;
    g->frame_malloc.pool_begin = 0;
    g->frame_malloc.pool_end = 0;
    g->frame_malloc.batch_size = 8000;
    g->frame_malloc.potential_limit = 4 * g->frame_malloc.batch_size;
    g->frame_malloc.allocated_from_os = 0;
    g->frame_malloc.allocated_from_global_pool = 0;
    g->frame_malloc.wasted = 0;
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i)
        g->frame_malloc.global_free_list[i] = 0;
}

// Counts how many bytes are in the global free list.
static size_t count_memory_in_global_list(global_state_t *g)
{
    // Count the memory remaining in the global free list.
    size_t size_remaining_in_global_list = 0;
    int i;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        struct free_list *p;
        size_t size_in_bucket = 0;
        p = g->frame_malloc.global_free_list[i];

        while (p) {
            size_in_bucket += FRAME_MALLOC_BUCKET_TO_SIZE(i);
            p = p->cdr;
        }
        size_remaining_in_global_list += size_in_bucket;
    }
    return size_remaining_in_global_list;
}

void __cilkrts_frame_malloc_global_cleanup(global_state_t *g)
{
    struct pool_cons *c;

    if (g->frame_malloc.check_for_leaks) {
        size_t memory_in_global_list = count_memory_in_global_list(g);
        // TBD: This check is weak.  Short of memory corruption,
        // I don't see how we have more memory in the free list
        // than allocated from the os.
        // Ideally, we should count the memory in the global free list
        // and check that we have it all.  But I believe the runtime
        // itself also uses some memory, which is not being tracked.
        if (memory_in_global_list > g->frame_malloc.allocated_from_os) {
            __cilkrts_bug("\nError. The Cilk runtime data structures may have been corrupted.\n");
        }
    }

    while ((c = g->frame_malloc.pool_list)) {
        g->frame_malloc.pool_list = c->cdr;
        __cilkrts_free(c->p);
        __cilkrts_free(c);
    }

    __cilkrts_mutex_destroy(0, &g->frame_malloc.lock);

    // Check that all the memory moved from the global pool into
    // workers has been returned to the global pool.
    if (g->frame_malloc.check_for_leaks
        && (g->frame_malloc.allocated_from_global_pool != 0)) {
        __cilkrts_bug("\n"
                      "---------------------------" "\n"
                      "  MEMORY LEAK DETECTED!!!  " "\n"
                      "---------------------------" "\n"
                      "\n");
    }
}

/*************************************************************
  per-worker allocator
*************************************************************/
/* allocate a batch of frames of size SIZE from the global pool
   and store them in the worker's free list */
static void allocate_batch(__cilkrts_worker *w, int bucket, size_t size)
{
    global_state_t *g = w->g;

    __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
#if USE_MMAP
        char *p = mmap(0, 12288, PROT_READ|PROT_WRITE,
                       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
            __cilkrts_bug("mmap failed %d", errno);
        assert(size < 4096);
        assert(p != MAP_FAILED);
        mprotect(p, 4096, PROT_NONE);
        mprotect(p + 8192, 4096, PROT_NONE);
        w->l->bucket_potential[bucket] += size;
        push(&w->l->free_list[bucket], (struct free_list *)(p + 8192 - size));
#else
        size_t bytes_allocated = 0;
        do {
            w->l->bucket_potential[bucket] += size;
            bytes_allocated += size;
            push(&w->l->free_list[bucket], global_alloc(g, bucket));
        } while (bytes_allocated < g->frame_malloc.batch_size);
#endif
    } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
}

static void gc_bucket(__cilkrts_worker *w, int bucket, size_t size)
{
    struct free_list *p, *q;
    global_state_t *g = w->g;
    size_t pot = w->l->bucket_potential[bucket];
    size_t newpot;

    /* Keep up to POT/2 elements in the free list.  The cost of
       counting up to POT/2 is amortized against POT. */
    newpot = 0;
    for (newpot = 0, p = w->l->free_list[bucket]; p && 2 * newpot < pot;
         p = p->cdr, newpot += size)
        ;
    w->l->bucket_potential[bucket] = newpot;

    if (p) {
        /* free the rest of the list.  The cost of grabbing the lock
           is amortized against POT/2; the cost of traversing the rest
           of the list is amortized against the free operation that
           puts the element on the list. */
        __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
            while ((q = pop(&p->cdr)))
#if USE_MMAP
                munmap((char *)q + size - 8192, 12288);
#else
                global_free(g, q, bucket);
#endif
        } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
    }
}
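/* Worked example of the potential scheme (added for reference), assuming the
   defaults set in __cilkrts_frame_malloc_global_init() (batch_size = 8000,
   potential_limit = 32000): a worker freeing 64-byte frames accumulates
   bucket_potential until it exceeds 32000 bytes, at which point
   __cilkrts_frame_free() calls gc_bucket().  gc_bucket() then keeps at most
   about half of that potential (~16000 bytes, i.e. roughly 250 of the
   64-byte frames) on the local free list and returns everything beyond that
   point to the global free list under the lock. */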
// Free all the memory in this bucket for the specified worker,
// returning it to the global pool's free list.
static void move_bucket_to_global_free_list(__cilkrts_worker *w, int bucket)
{
    struct free_list *p, *q;
    global_state_t *g = w->g;

    p = w->l->free_list[bucket];

    if (p) {
        __cilkrts_mutex_lock(w, &g->frame_malloc.lock); {
            while ((q = pop(&p))) {
#if USE_MMAP
                size_t size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
                munmap((char *)q + size - 8192, 12288);
#else
                global_free(g, q, bucket);
#endif
            }
        } __cilkrts_mutex_unlock(w, &g->frame_malloc.lock);
    }

    // I'm not sure this does anything useful now, since
    // the worker is about to be destroyed.  But why not?
    w->l->bucket_potential[bucket] = 0;
}

static int bucket_of_size(size_t size)
{
    int i;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i)
        if (size <= FRAME_MALLOC_BUCKET_TO_SIZE(i))
            return i;

    CILK_ASSERT(0 /* can't happen */);
    return -1;
}

size_t __cilkrts_frame_malloc_roundup(size_t size)
{
    if (size > FRAME_MALLOC_MAX_SIZE) {
        /* nothing, leave it alone */
    } else {
        int bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
    }
    return size;
}

size_t __cilkrts_size_of_bucket(int bucket)
{
    CILK_ASSERT(bucket >= 0 && bucket < FRAME_MALLOC_NBUCKETS);
    return FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
}

void *__cilkrts_frame_malloc(__cilkrts_worker *w, size_t size)
{
    int bucket;
    void *mem;

    /* if too large, or if no worker, fall back to __cilkrts_malloc() */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_ALLOC_LARGE);
        return __cilkrts_malloc(size);
    }

    START_INTERVAL(w, INTERVAL_FRAME_ALLOC); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);

        while (!(mem = pop(&w->l->free_list[bucket]))) {
            /* get a batch of frames from the global pool */
            START_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL) {
                allocate_batch(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_ALLOC);

    return mem;
}

void __cilkrts_frame_free(__cilkrts_worker *w, void *p0, size_t size)
{
    int bucket;
    struct free_list *p = (struct free_list *)p0;

    /* if too large, or if no worker, fall back to __cilkrts_free() */
    if (!w || size > FRAME_MALLOC_MAX_SIZE) {
        NOTE_INTERVAL(w, INTERVAL_FRAME_FREE_LARGE);
        __cilkrts_free(p);
        return;
    }

#if CILK_LIB_DEBUG
    *(volatile long *)w;
#endif

    START_INTERVAL(w, INTERVAL_FRAME_FREE); {
        bucket = bucket_of_size(size);
        size = FRAME_MALLOC_BUCKET_TO_SIZE(bucket);
        w->l->bucket_potential[bucket] += size;
        push(&w->l->free_list[bucket], p);
        if (w->l->bucket_potential[bucket] >
            w->g->frame_malloc.potential_limit) {
            START_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL) {
                gc_bucket(w, bucket, size);
            } STOP_INTERVAL(w, INTERVAL_FRAME_FREE_GLOBAL);
        }
    } STOP_INTERVAL(w, INTERVAL_FRAME_FREE);
}

void __cilkrts_frame_malloc_per_worker_init(__cilkrts_worker *w)
{
    int i;
    local_state *l = w->l;

    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        l->free_list[i] = 0;
        l->bucket_potential[i] = 0;
    }
}

void __cilkrts_frame_malloc_per_worker_cleanup(__cilkrts_worker *w)
{
    int i;

    // Move memory to the global pool.  This operation
    // ensures the memory does not become unreachable / leak
    // when the worker is destroyed.
    for (i = 0; i < FRAME_MALLOC_NBUCKETS; ++i) {
        move_bucket_to_global_free_list(w, i);
    }
}

/*
  Local Variables: **
  c-file-style:"bsd" **
  c-basic-offset:4 **
  indent-tabs-mode:nil **
  End: **
*/
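/* Illustrative usage sketch (compiled out, not part of the runtime).  It
   shows the intended pairing of __cilkrts_frame_malloc() and
   __cilkrts_frame_free(): the caller passes the same size to both calls (or
   the value returned by __cilkrts_frame_malloc_roundup()), so the frame is
   returned to the bucket it came from, and oversized or worker-less requests
   transparently fall back to __cilkrts_malloc()/__cilkrts_free().  The
   function name below is hypothetical and the worker pointer `w` is assumed
   to be supplied by the caller's context. */
#if 0
static void frame_malloc_usage_sketch(__cilkrts_worker *w)
{
    /* A 200-byte request is served from the 256-byte bucket (bucket 2). */
    size_t size = 200;
    void *frame = __cilkrts_frame_malloc(w, size);

    /* ... use the frame ... */

    /* Freeing with the same size maps back to bucket 2 and pushes the
       frame onto this worker's local free list. */
    __cilkrts_frame_free(w, frame, size);
}
#endif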