/* Cilk_abi.c                  -*-C++-*-
 *
 *************************************************************************
 *
 *  @copyright
 *  Copyright (C) 2010-2013, Intel Corporation
 *  All rights reserved.
 *
 *  @copyright
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *    * Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *    * Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in
 *      the documentation and/or other materials provided with the
 *      distribution.
 *    * Neither the name of Intel Corporation nor the names of its
 *      contributors may be used to endorse or promote products derived
 *      from this software without specific prior written permission.
 *
 *  @copyright
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
 *  OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 *  AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
 *  WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 *
 **************************************************************************/

/**
 * @file cilk-abi.c
 *
 * @brief cilk-abi.c implements all of the entrypoints to the Intel Cilk
 * Plus runtime.
 */

/*
 * Define this macro so that compilation of this file generates the
 * non-inlined versions of certain functions in cilk_api.h.
 */
#include "internal/abi.h"
#include "cilk/cilk_api.h"
#include "cilk/cilk_undocumented.h"
#include "cilktools/cilkscreen.h"

#include "global_state.h"
#include "os.h"
#include "os_mutex.h"
#include "bug.h"
#include "local_state.h"
#include "full_frame.h"
#include "pedigrees.h"
#include "scheduler.h"
#include "sysdep.h"
#include "except.h"
#include "cilk_malloc.h"
#include "record-replay.h"

#include <errno.h>
#include <string.h>
#include <stdlib.h>

#ifdef _MSC_VER
/* Some versions of icc don't support limits.h on Linux if
   gcc 4.3 or newer is installed. */
#include <limits.h>

/* Declare _ReturnAddress compiler intrinsic */
void * _ReturnAddress(void);
#pragma intrinsic(_ReturnAddress)

#include "sysdep-win.h"     // Needed for sysdep_init_module()
#endif  /* _MSC_VER */

#include "metacall_impl.h"
#include "reducer_impl.h"
#include "cilk-ittnotify.h"
#include "cilk-tbb-interop.h"

#define TBB_INTEROP_DATA_DELAYED_UNTIL_BIND (void *)-1
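/*
 * Example (illustration only, not runtime code): enter_frame_internal()
 * below stores the ABI version in the top byte of sf->flags, above the
 * bits covered by CILK_FRAME_FLAGS_MASK.  A hypothetical helper that
 * recovers the version a frame was compiled against:
 *
 *     static inline uint32_t frame_abi_version(__cilkrts_stack_frame *sf)
 *     {
 *         return sf->flags >> 24;  // high byte = version, low bits = flags
 *     }
 */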
/**
 * __cilkrts_bind_thread is a versioned entrypoint.  The runtime should be
 * exporting copies of __cilkrts_bind_version for the current and all previous
 * versions of the ABI.
 *
 * This macro should always be set to generate a version to match the current
 * version, __CILKRTS_ABI_VERSION.
 */
#define BIND_THREAD_RTN __cilkrts_bind_thread_1

static inline
void enter_frame_internal(__cilkrts_stack_frame *sf, uint32_t version)
{
    __cilkrts_worker *w = __cilkrts_get_tls_worker();
    if (w == 0) { /* slow path */
        w = BIND_THREAD_RTN();

        sf->flags = CILK_FRAME_LAST | (version << 24);
        CILK_ASSERT((sf->flags & CILK_FRAME_FLAGS_MASK) == CILK_FRAME_LAST);
    } else {
        sf->flags = (version << 24);
        CILK_ASSERT((sf->flags & CILK_FRAME_FLAGS_MASK) == 0);
    }
    sf->call_parent = w->current_stack_frame;
    sf->worker = w;
    w->current_stack_frame = sf;
}

CILK_ABI_VOID __cilkrts_enter_frame(__cilkrts_stack_frame *sf)
{
    enter_frame_internal(sf, 0);
}

CILK_ABI_VOID __cilkrts_enter_frame_1(__cilkrts_stack_frame *sf)
{
    enter_frame_internal(sf, 1);
    sf->reserved = 0;
}

static inline
void enter_frame_fast_internal(__cilkrts_stack_frame *sf, uint32_t version)
{
    __cilkrts_worker *w = __cilkrts_get_tls_worker_fast();
    sf->flags = version << 24;
    sf->call_parent = w->current_stack_frame;
    sf->worker = w;
    w->current_stack_frame = sf;
}

CILK_ABI_VOID __cilkrts_enter_frame_fast(__cilkrts_stack_frame *sf)
{
    enter_frame_fast_internal(sf, 0);
}

CILK_ABI_VOID __cilkrts_enter_frame_fast_1(__cilkrts_stack_frame *sf)
{
    enter_frame_fast_internal(sf, 1);
    sf->reserved = 0;
}

/**
 * A component of the THE protocol.  __cilkrts_undo_detach checks whether
 * this frame's parent has been stolen.  If it hasn't, the frame can return
 * normally.  If the parent has been stolen, or if we suspect it might be,
 * then __cilkrts_leave_frame() needs to call into the runtime.
 *
 * @note __cilkrts_undo_detach() is comparing the exception pointer against
 * the tail pointer.  The exception pointer is modified when another worker
 * is considering whether it can steal a frame.  The head pointer is updated
 * to match when the worker lock is taken out and the thief is sure that
 * it can complete the steal.  If the steal cannot be completed, the thief
 * will restore the exception pointer.
 *
 * @return true if undo-detach failed.
 */
static int __cilkrts_undo_detach(__cilkrts_stack_frame *sf)
{
    __cilkrts_worker *w = sf->worker;
    __cilkrts_stack_frame *volatile *t = w->tail;

/*    DBGPRINTF("%d - __cilkrts_undo_detach - sf %p\n", w->self, sf); */

    --t;
    w->tail = t;
    /* On x86 the __sync_fetch_and_<op> family includes a full memory
       barrier.  In theory the sequence in the second branch of the #if
       should be faster, but on most x86 it is not. */
#if defined __i386__ || defined __x86_64__
    __sync_fetch_and_and(&sf->flags, ~CILK_FRAME_DETACHED);
#else
    __cilkrts_fence(); /* membar #StoreLoad */
    sf->flags &= ~CILK_FRAME_DETACHED;
#endif

    return __builtin_expect(t < w->exc, 0);
}
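/*
 * Example (standalone sketch of the THE protocol; every name here is local
 * to the illustration, not runtime API).  The victim pops by decrementing
 * tail; a thief advertises a possible steal by raising exc.  When the two
 * cross, the victim must resolve the race under the worker lock:
 *
 *     struct mini_deque { int head, exc, tail; };
 *
 *     // Victim-side pop, mirroring __cilkrts_undo_detach() above.
 *     static int pop_needs_slow_path(struct mini_deque *d)
 *     {
 *         --d->tail;                 // optimistically reclaim the frame
 *         // a #StoreLoad fence goes here, as in the code above
 *         return d->tail < d->exc;   // thief may own it; use the lock
 *     }
 *
 * The real code folds the fence into __sync_fetch_and_and() on x86 and
 * compares pointers into the worker's ltq rather than integer indices.
 */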
CILK_ABI_VOID __cilkrts_leave_frame(__cilkrts_stack_frame *sf)
{
    __cilkrts_worker *w = sf->worker;

/*    DBGPRINTF("%d-%p __cilkrts_leave_frame - sf %p, flags: %x\n",
              w->self, GetWorkerFiber(w), sf, sf->flags); */

#ifdef _WIN32
    /* if leave frame was called from our unwind handler, leave_frame should
       proceed no further. */
    if (sf->flags & CILK_FRAME_UNWINDING)
    {
/*        DBGPRINTF("%d - __cilkrts_leave_frame - aborting due to UNWINDING flag\n", w->self); */

        // If this is the frame of a spawn helper (indicated by the
        // CILK_FRAME_DETACHED flag) we must update the pedigree.  The
        // pedigree points to nodes allocated on the stack.  Failing to
        // update it will result in an accvio/segfault if the pedigree is
        // walked.  This must happen for all spawn helper frames, even if
        // we're processing an exception.
        if ((sf->flags & CILK_FRAME_DETACHED))
        {
            update_pedigree_on_leave_frame(w, sf);
        }
        return;
    }
#endif

#if CILK_LIB_DEBUG
    /* ensure the caller popped itself */
    CILK_ASSERT(w->current_stack_frame != sf);
#endif

    /* The exiting function should have checked for zero flags,
       so there is no check for flags == 0 here. */

#if CILK_LIB_DEBUG
    if (__builtin_expect(sf->flags & (CILK_FRAME_EXITING|CILK_FRAME_UNSYNCHED), 0))
        __cilkrts_bug("W%u: function exiting with invalid flags %02x\n",
                      w->self, sf->flags);
#endif

    /* Must return normally if (1) the active function was called
       and not spawned, or (2) the parent has never been stolen. */
    if ((sf->flags & CILK_FRAME_DETACHED)) {
/*        DBGPRINTF("%d - __cilkrts_leave_frame - CILK_FRAME_DETACHED\n", w->self); */

#ifndef _WIN32
        if (__builtin_expect(sf->flags & CILK_FRAME_EXCEPTING, 0)) {
            // Pedigree will be updated in __cilkrts_leave_frame.  We need
            // the pedigree before the update for record/replay
            // update_pedigree_on_leave_frame(w, sf);
            __cilkrts_return_exception(sf);
            /* If return_exception returns the caller is attached.
               leave_frame is called from a cleanup (destructor)
               for the frame object.  The caller will reraise the
               exception. */
            return;
        }
#endif

        // During replay, check whether w was the last worker to continue
        replay_wait_for_steal_if_parent_was_stolen(w);

        // Attempt to undo the detach
        if (__builtin_expect(__cilkrts_undo_detach(sf), 0)) {
            // The update of pedigree for leaving the frame occurs
            // inside this call if it does not return.
            __cilkrts_c_THE_exception_check(w, sf);
        }

        update_pedigree_on_leave_frame(w, sf);

        /* This path is taken when undo-detach wins the race with stealing.
           Otherwise this strand terminates and the caller will be resumed
           via setjmp at sync. */
        if (__builtin_expect(sf->flags & CILK_FRAME_FLAGS_MASK, 0))
            __cilkrts_bug("W%u: frame won undo-detach race with flags %02x\n",
                          w->self, sf->flags);

        return;
    }

#if CILK_LIB_DEBUG
    sf->flags |= CILK_FRAME_EXITING;
#endif

    if (__builtin_expect(sf->flags & CILK_FRAME_LAST, 0))
        __cilkrts_c_return_from_initial(w); /* does return */
    else if (sf->flags & CILK_FRAME_STOLEN)
        __cilkrts_return(w); /* does return */

/*    DBGPRINTF("%d-%p __cilkrts_leave_frame - returning, StackBase: %p\n",
              w->self, GetWorkerFiber(w)); */
}

/* Caller must have called setjmp. */
CILK_ABI_VOID __cilkrts_sync(__cilkrts_stack_frame *sf)
{
    __cilkrts_worker *w = sf->worker;

/*    DBGPRINTF("%d-%p __cilkrts_sync - sf %p\n",
              w->self, GetWorkerFiber(w), sf); */

    if (__builtin_expect(!(sf->flags & CILK_FRAME_UNSYNCHED), 0))
        __cilkrts_bug("W%u: double sync %p\n", w->self, sf);

#ifndef _WIN32
    if (__builtin_expect(sf->flags & CILK_FRAME_EXCEPTING, 0)) {
        __cilkrts_c_sync_except(w, sf);
    }
#endif

    __cilkrts_c_sync(w, sf);
}

/*
 * __cilkrts_get_sf
 *
 * Debugging aid to provide access to the current __cilkrts_stack_frame.
 *
 * Not documented!
 */
CILK_API_VOID_PTR __cilkrts_get_sf(void)
{
    __cilkrts_worker *w = __cilkrts_get_tls_worker();

    if (0 == w)
        return NULL;

    return w->current_stack_frame;
}
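/*
 * Example (compiler's-eye sketch, not part of the runtime): roughly what
 * compiled code does with the entrypoints above for
 * "cilk_spawn f(); cilk_sync;".  The helper structure and setjmp details
 * vary by compiler and are simplified here; f() is a placeholder.
 *
 *     static void spawn_helper(void)
 *     {
 *         __cilkrts_stack_frame sf;
 *         __cilkrts_enter_frame_fast_1(&sf);
 *         __cilkrts_detach(&sf);        // parent becomes stealable
 *         f();
 *         __cilkrts_pop_frame(&sf);
 *         __cilkrts_leave_frame(&sf);   // may not return if parent stolen
 *     }
 *
 *     void spawning_function(void)
 *     {
 *         __cilkrts_stack_frame sf;
 *         __cilkrts_enter_frame_1(&sf);
 *         if (!CILK_SETJMP(sf.ctx))     // save the continuation
 *             spawn_helper();           // cilk_spawn f();
 *         if (sf.flags & CILK_FRAME_UNSYNCHED)
 *             if (!CILK_SETJMP(sf.ctx))
 *                 __cilkrts_sync(&sf);  // cilk_sync; may resume elsewhere
 *         __cilkrts_pop_frame(&sf);
 *         __cilkrts_leave_frame(&sf);
 *     }
 */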
/* Call with global lock held */
static __cilkrts_worker *find_free_worker(global_state_t *g)
{
    __cilkrts_worker *w = 0;
    int i;

    // Scan the non-system workers looking for one which is free so we can
    // use it.
    for (i = g->P - 1; i < g->total_workers; ++i) {
        w = g->workers[i];
        CILK_ASSERT(WORKER_SYSTEM != w->l->type);
        if (w->l->type == WORKER_FREE) {
            w->l->type = WORKER_USER;
            w->l->team = w;
            return w;
        }
    }

    // If we ran out of workers, create a new one.  It doesn't actually
    // belong to the Cilk global state so nobody will ever try to steal
    // from it.
    w = (__cilkrts_worker *)__cilkrts_malloc(sizeof(*w));
    __cilkrts_cilkscreen_ignore_block(w, w+1);
    make_worker(g, -1, w);
    w->l->type = WORKER_USER;
    w->l->team = w;
    return w;
}
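/*
 * Example (diagnostic sketch under the same table layout as above;
 * count_free_worker_slots is hypothetical, not a runtime function).
 * Slots [0, g->P - 1) hold system workers and slots [g->P - 1,
 * g->total_workers) hold user/free workers, which is why the scan above
 * starts at g->P - 1:
 *
 *     static int count_free_worker_slots(global_state_t *g)
 *     {
 *         int i, n = 0;
 *         // Caller must hold the global lock, as with find_free_worker().
 *         for (i = g->P - 1; i < g->total_workers; ++i)
 *             if (g->workers[i]->l->type == WORKER_FREE)
 *                 ++n;
 *         return n;
 *     }
 */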
/*
 * __cilkrts_bind_thread
 *
 * Exported function to bind a thread to the runtime.
 *
 * This function name should always have a trailing suffix for the latest ABI
 * version.  This means that code built with a new compiler will not load
 * against an old copy of the runtime.
 *
 * Symbols for the function called by code compiled with old versions of the
 * compiler are created in an OS-specific manner:
 *  - On Windows the old symbols are defined in the cilk-exports.def linker
 *    definitions file as aliases of BIND_THREAD_RTN
 *  - On Linux aliased symbols are created for BIND_THREAD_RTN in this file
 *  - On MacOS the alternate entrypoints are implemented and simply call
 *    BIND_THREAD_RTN.
 */
CILK_ABI_WORKER_PTR BIND_THREAD_RTN(void)
{
    __cilkrts_worker *w;
    int start_cilkscreen = 0;
#ifdef USE_ITTNOTIFY
    static int unique_obj;
#endif

    // Cannot set this pointer until after __cilkrts_init_internal() call:
    global_state_t* g;

    ITT_SYNC_CREATE (&unique_obj, "Initialization");
    ITT_SYNC_PREPARE(&unique_obj);
    ITT_SYNC_ACQUIRED(&unique_obj);

    /* 1: Initialize and start the Cilk runtime */
    __cilkrts_init_internal(1);

    /*
     * 2: Choose a worker for this thread (fail if none left).  The table of
     *    user workers is protected by the global OS mutex lock.
     */
    g = cilkg_get_global_state();
    global_os_mutex_lock();
    if (__builtin_expect(g->work_done, 0))
        __cilkrts_bug("Attempt to enter Cilk while Cilk is shutting down");
    w = find_free_worker(g);
    CILK_ASSERT(w);

    __cilkrts_set_tls_worker(w);
    __cilkrts_cilkscreen_establish_worker(w);

    {
        full_frame *ff = __cilkrts_make_full_frame(w, 0);

        ff->fiber_self = cilk_fiber_allocate_from_thread();
        CILK_ASSERT(ff->fiber_self);

        cilk_fiber_set_owner(ff->fiber_self, w);
        cilk_fiber_tbb_interop_use_saved_stack_op_info(ff->fiber_self);

        CILK_ASSERT(ff->join_counter == 0);
        ff->join_counter = 1;
        w->l->frame_ff = ff;
        w->reducer_map = __cilkrts_make_reducer_map(w);
        __cilkrts_set_leftmost_reducer_map(w->reducer_map, 1);
        load_pedigree_leaf_into_user_worker(w);
    }

    // Make sure that the head and tail are reset, and saved_protected_tail
    // allows all frames to be stolen.
    //
    // Note that we must NOT check w->exc, since workers that are trying to
    // steal from it will be updating w->exc and we don't own the worker
    // lock.  It's not worth taking out the lock just for an assertion.
    CILK_ASSERT(w->head == w->l->ltq);
    CILK_ASSERT(w->tail == w->l->ltq);
    CILK_ASSERT(w->protected_tail == w->ltq_limit);

    // There may have been an old pending exception which was freed when the
    // exception was caught outside of Cilk
    w->l->pending_exception = NULL;

    w->reserved = NULL;

    // If we've already created a scheduling fiber for this worker, we'll
    // just reuse it.  If w->self < 0, it means that this is an ad-hoc user
    // worker not known to the global state.  Thus, we need to create a
    // scheduling stack only if we don't already have one and w->self >= 0.
    if (NULL == w->l->scheduling_fiber && w->self >= 0)
    {
        START_INTERVAL(w, INTERVAL_FIBER_ALLOCATE) {
            // Create a scheduling fiber for this worker.
            w->l->scheduling_fiber =
                cilk_fiber_allocate_from_heap(CILK_SCHEDULING_STACK_SIZE);
            cilk_fiber_reset_state(w->l->scheduling_fiber,
                                   scheduler_fiber_proc_for_user_worker);
            cilk_fiber_set_owner(w->l->scheduling_fiber, w);
        } STOP_INTERVAL(w, INTERVAL_FIBER_ALLOCATE);
    }

    // If the scheduling fiber is NULL, we've either exceeded our quota for
    // fibers or workers or we're out of memory, so we should lose
    // parallelism by disallowing stealing.
    if (NULL == w->l->scheduling_fiber)
        __cilkrts_disallow_stealing(w, NULL);

    start_cilkscreen = (0 == w->g->Q);

    if (w->self != -1) {
        // w->self != -1, means that w is a normal user worker and must be
        // accounted for by the global state since other workers can steal
        // from it.
        // w->self == -1, means that w is an overflow worker and was created
        // on demand.  I.e., it does not need to be accounted for by the
        // global state.
        __cilkrts_enter_cilk(w->g);
    }

    global_os_mutex_unlock();

    /* If there's only 1 worker, the counts will be started in
     * __cilkrts_scheduler */
    if (g->P > 1) {
        START_INTERVAL(w, INTERVAL_IN_SCHEDULER);
        START_INTERVAL(w, INTERVAL_WORKING);
    }

    ITT_SYNC_RELEASING(&unique_obj);

    /* Turn on Cilkscreen if this is the first worker.  This needs to be done
     * when we are NOT holding the os mutex. */
    if (start_cilkscreen)
        __cilkrts_cilkscreen_enable_instrumentation();

    return w;
}

#ifndef _MSC_VER
/*
 * Define old version-specific symbols for binding threads (since they exist
 * in all Cilk code).  These aliases prohibit newly compiled code from
 * loading an old version of the runtime.  We can handle old code with a new
 * runtime, but new code with an old runtime is verboten!
 *
 * For Windows, the aliased symbol is exported in cilk-exports.def.
 */
#if defined(_DARWIN_C_SOURCE) || defined(__APPLE__)

/**
 * Mac OS X: Unfortunately, Darwin doesn't allow aliasing, so we just make a
 * call and hope the optimizer does the right thing.
 */
CILK_ABI_WORKER_PTR __cilkrts_bind_thread (void) {
    return BIND_THREAD_RTN();
}

#else

/**
 * Macro to convert a parameter to a string.  Used on Linux or BSD.
 */
#define STRINGIFY(x) #x

/**
 * Macro to generate an __attribute__ for an aliased name
 */
#define ALIASED_NAME(x) __attribute__ ((alias (STRINGIFY(x))))

/**
 * Linux or BSD: Use the alias attribute to make the labels for the versioned
 * functions point to the same place in the code as the original.  Using
 * the two macros is annoying but required.
 */
CILK_ABI_WORKER_PTR __cilkrts_bind_thread(void)
    ALIASED_NAME(BIND_THREAD_RTN);

#endif // defined _DARWIN_C_SOURCE || defined __APPLE__
#endif // !defined _MSC_VER

CILK_API_SIZET
__cilkrts_get_stack_size(void) {
    return cilkg_get_stack_size();
}
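/*
 * Example (illustration of the aliasing idiom above, with made-up names):
 * on ELF targets the alias attribute binds a second symbol to an existing
 * definition, so old entrypoint names resolve with no wrapper call:
 *
 *     int current_impl_v2(void) { return 2; }
 *     int old_entry_v1(void) __attribute__((alias("current_impl_v2")));
 *
 * A caller linked against old_entry_v1 lands directly in current_impl_v2.
 * Contrast this with the Mac OS X branch above, which pays for a real call
 * and relies on the optimizer to turn it into a jump.
 */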
// Method for debugging.
CILK_API_VOID __cilkrts_dump_stats(void)
{
    // While the stats aren't protected by the global OS mutex, the table
    // of workers is, so take out the global OS mutex while we're doing this.
    global_os_mutex_lock();
    if (cilkg_is_published()) {
        global_state_t *g = cilkg_get_global_state();
        __cilkrts_dump_stats_to_stderr(g);
    }
    else {
        __cilkrts_bug("Attempting to report Cilk stats before the runtime has started\n");
    }
    global_os_mutex_unlock();
}

#ifndef _WIN32
CILK_ABI_THROWS_VOID __cilkrts_rethrow(__cilkrts_stack_frame *sf)
{
    __cilkrts_gcc_rethrow(sf);
}
#endif

/*
 * __cilkrts_unwatch_stack
 *
 * Callback for TBB to tell us they don't want to watch the stack anymore
 */
static __cilk_tbb_retcode __cilkrts_unwatch_stack(void *data)
{
    __cilk_tbb_stack_op_thunk o;

    // If the cilk_fiber wasn't available fetch it now
    if (TBB_INTEROP_DATA_DELAYED_UNTIL_BIND == data)
    {
        full_frame *ff;
        __cilkrts_worker *w = __cilkrts_get_tls_worker();
        if (NULL == w)
        {
            // Free any saved stack op information
            cilk_fiber_tbb_interop_free_stack_op_info();

            return 0;       /* Success! */
        }

        __cilkrts_worker_lock(w);
        ff = w->l->frame_ff;
        __cilkrts_frame_lock(w,ff);
        data = ff->fiber_self;
        __cilkrts_frame_unlock(w,ff);
        __cilkrts_worker_unlock(w);
    }

#if CILK_LIB_DEBUG /* Debug code */
    /* Get current stack */
    full_frame *ff;
    __cilkrts_worker *w = __cilkrts_get_tls_worker();
    __cilkrts_worker_lock(w);
    ff = w->l->frame_ff;
    __cilkrts_frame_lock(w,ff);
    CILK_ASSERT (data == ff->fiber_self);
    __cilkrts_frame_unlock(w,ff);
    __cilkrts_worker_unlock(w);
#endif

    /* Clear the callback information */
    o.data = NULL;
    o.routine = NULL;
    cilk_fiber_set_stack_op((cilk_fiber*)data, o);

    // Note. Do *NOT* free any saved stack information here.  If they want to
    // free the saved stack op information, they'll do it when the thread is
    // unbound.
    return 0;    /* Success! */
}

/*
 * __cilkrts_watch_stack
 *
 * Called by TBB, defined by Cilk.
 *
 * Requests that Cilk invoke the stack op routine when it orphans a stack.
 * Cilk sets *u to a thunk that TBB should call when it is no longer
 * interested in watching the stack.
 */
CILK_API_TBB_RETCODE
__cilkrts_watch_stack(__cilk_tbb_unwatch_thunk *u,
                      __cilk_tbb_stack_op_thunk o)
{
    cilk_fiber* current_fiber;
    __cilkrts_worker *w;

#ifdef _MSC_VER
    // This may be called by TBB *before* the OS has given us our
    // initialization call.  Make sure the module is initialized.
    sysdep_init_module();
#endif

    // Fetch the __cilkrts_worker bound to this thread
    w = __cilkrts_get_tls_worker();
    if (NULL == w)
    {
        // Save data for later.  We'll deal with it when/if this thread binds
        // to the runtime.
        cilk_fiber_tbb_interop_save_stack_op_info(o);

        u->routine = __cilkrts_unwatch_stack;
        u->data = TBB_INTEROP_DATA_DELAYED_UNTIL_BIND;
        return 0;
    }

    /* Get current stack */
    __cilkrts_worker_lock(w);
    current_fiber = w->l->frame_ff->fiber_self;
    __cilkrts_worker_unlock(w);

/*    CILK_ASSERT( !sd->stack_op_data ); */
/*    CILK_ASSERT( !sd->stack_op_routine ); */

    /* Give TBB our callback */
    u->routine = __cilkrts_unwatch_stack;
    u->data = current_fiber;

    /* Save the callback information */
    cilk_fiber_set_stack_op(current_fiber, o);

    return 0;   /* Success! */
}
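/*
 * Example (caller's-eye sketch; my_stack_op and my_data are hypothetical):
 * how a library such as TBB is expected to drive the watch/unwatch pair
 * defined above.
 *
 *     __cilk_tbb_stack_op_thunk op;
 *     __cilk_tbb_unwatch_thunk un;
 *     op.routine = my_stack_op;   // called when Cilk orphans the stack
 *     op.data    = my_data;
 *     if (0 == __cilkrts_watch_stack(&un, op)) {
 *         // ... run code that cares about stack ownership changes ...
 *         un.routine(un.data);    // done watching; invoke the thunk
 *     }
 *
 * Note that the thunk is valid even if this thread has not yet bound to
 * the runtime; the TBB_INTEROP_DATA_DELAYED_UNTIL_BIND sentinel defers
 * the fiber lookup until the unwatch call.
 */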
// This function must be called only within a continuation, within the stack
// frame of the continuation itself.
CILK_API_INT __cilkrts_synched(void)
{
    __cilkrts_worker *w = __cilkrts_get_tls_worker();

    // If we don't have a worker, then we're synched by definition :o)
    if (NULL == w)
        return 1;

    // Check to see if we are in a stolen continuation.  If not, then
    // we are synched.
    uint32_t flags = w->current_stack_frame->flags;
    if (0 == (flags & CILK_FRAME_UNSYNCHED))
        return 1;

    // We are in a stolen continuation, but the join counter might have been
    // decremented to one, making us synched again.  Get the full frame so
    // that we can check the join counter.  ASSUME: frame_ff is stable (can
    // be read without a lock) in a stolen continuation -- it can't be
    // stolen while it's currently executing.
    full_frame *ff = w->l->frame_ff;

    // Make sure we have a full frame
    // TBD: Don't think that we should ever not have a full frame here.
    // CILK_ASSERT(NULL != ff); ?
    if (NULL == ff)
        return 1;

    // We're synched if there are no outstanding children at this instant in
    // time.  Note that this is a known race, but it's ok since we're only
    // reading.  We can get false negatives, but not false positives.  (I.e.,
    // we can read a non-one join_counter just before it goes to one, but the
    // join_counter cannot go from one to greater than one while we're
    // reading.)
    return 1 == ff->join_counter;
}

CILK_API_INT
__cilkrts_bump_loop_rank_internal(__cilkrts_worker* w)
{
    // If we don't have a worker, then the runtime is not bound to this
    // thread and there is no rank to increment.
    if (NULL == w)
        return -1;

    // We're at the start of the loop body.  Advance the cilk_for loop
    // body pedigree by following the parent link and updating its rank.
    //
    // Normally, we'd just write "w->pedigree.parent->rank++".
    // But we need to cast away the "const".
    ((__cilkrts_pedigree*) w->pedigree.parent)->rank++;

    // Zero the worker's pedigree rank since this is the start of a new
    // pedigree domain.
    w->pedigree.rank = 0;

    return 0;
}

CILK_ABI_VOID
__cilkrts_save_fp_ctrl_state(__cilkrts_stack_frame *sf)
{
    // Pass call onto OS/architecture dependent function
    sysdep_save_fp_ctrl_state(sf);
}
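/*
 * Example (user's-eye sketch using the public cilk_api.h interface): the
 * rank bump above maintains the pedigree chain that user code can walk.
 * Assumes __cilkrts_get_pedigree() as declared in cilk/cilk_api.h.
 *
 *     #include <cilk/cilk_api.h>
 *     #include <inttypes.h>
 *     #include <stdio.h>
 *
 *     void print_pedigree(void)
 *     {
 *         __cilkrts_pedigree ped = __cilkrts_get_pedigree();
 *         const __cilkrts_pedigree *node = &ped;
 *         for ( ; node; node = node->parent)
 *             printf("rank %" PRIu64 "\n", node->rank);
 *     }
 */

/* end cilk-abi.c */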