aboutsummaryrefslogtreecommitdiffstats
path: root/gcc-4.9/libcilkrts/runtime/cilk-abi-cilk-for.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'gcc-4.9/libcilkrts/runtime/cilk-abi-cilk-for.cpp')
-rw-r--r--gcc-4.9/libcilkrts/runtime/cilk-abi-cilk-for.cpp416
1 files changed, 416 insertions, 0 deletions
diff --git a/gcc-4.9/libcilkrts/runtime/cilk-abi-cilk-for.cpp b/gcc-4.9/libcilkrts/runtime/cilk-abi-cilk-for.cpp
new file mode 100644
index 000000000..4cd04f521
--- /dev/null
+++ b/gcc-4.9/libcilkrts/runtime/cilk-abi-cilk-for.cpp
@@ -0,0 +1,416 @@
+/* cilk-abi-cilk-for.cpp -*-C++-*-
+ *
+ *************************************************************************
+ *
+ * @copyright
+ * Copyright (C) 2011, 2013, Intel Corporation
+ * All rights reserved.
+ *
+ * @copyright
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Intel Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * @copyright
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
+ * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************/
+
+/* Implementation of cilk_for ABI.
+ *
+ * This file must be C++, not C, in order to handle C++ exceptions correctly
+ * from within the body of the cilk_for loop
+ */
+
+#include "internal/abi.h"
+#include "metacall_impl.h"
+#include "global_state.h"
+
+// Icky macros to determine if we're compiled with optimization. Based on
+// the declaration of __CILKRTS_ASSERT in common.h
+#if defined(_WIN32)
+# if defined (_DEBUG)
+# define CILKRTS_OPTIMIZED 0 // Assumes /MDd is always used with /Od
+# else
+# define CILKRTS_OPTIMIZED 1
+# endif // defined(_DEBUG)
+#else
+# if defined(__OPTIMIZE__)
+# define CILKRTS_OPTIMIZED 1
+# else
+# define CILKRTS_OPTIMIZED 0
+# endif
+#endif
+
+template <typename count_t>
+static inline int grainsize(int req, count_t count)
+{
+ // A positive requested grain size comes from the user. A very high grain
+ // size risks losing parallelism, but the user told us what they want for
+ // grainsize. Who are we to argue?
+ if (req > 0)
+ return req;
+
+ // At present, a negative requested grain size is treated the same way as
+ // a zero grain size, i.e., the runtime computes the actual grainsize
+ // using a hueristic. In the future, the compiler may give us additional
+ // information about the size of the cilk_for body by passing a negative
+ // grain size.
+
+ // Avoid generating a zero grainsize, even for empty loops.
+ if (count < 1)
+ return 1;
+
+ global_state_t* g = cilkg_get_global_state();
+ if (g->under_ptool)
+ {
+ // Grainsize = 1, when running under PIN, and when the grainsize has
+ // not explicitly been set by the user.
+ return 1;
+ }
+ else
+ {
+ // Divide loop count by 8 times the worker count and round up.
+ const int Px8 = g->P * 8;
+ count_t n = (count + Px8 - 1) / Px8;
+
+ // 2K should be enough to amortize the cost of the cilk_for. Any
+ // larger grainsize risks losing parallelism.
+ if (n > 2048)
+ return 2048;
+ return (int) n; // n <= 2048, so no loss of precision on cast to int
+ }
+}
+
+/*
+ * call_cilk_for_loop_body
+ *
+ * Centralizes the code to call the loop body. The compiler should be
+ * inlining this code
+ *
+ * low - Low loop index we're considering in this portion of the algorithm
+ * high - High loop index we're considering in this portion of the algorithm
+ * body - lambda function for the cilk_for loop body
+ * data - data used by the lambda function
+ * w - __cilkrts_worker we're currently executing on
+ * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
+ * the cilk_for loop to flatten out the internal nodes
+ */
+template <typename count_t, typename F>
+inline static
+void call_cilk_for_loop_body(count_t low, count_t high,
+ F body, void *data,
+ __cilkrts_worker *w,
+ __cilkrts_pedigree *loop_root_pedigree)
+{
+ // Cilkscreen should not report this call in a stack trace
+ NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
+
+ // The worker is only valid until the first spawn. Fetch the
+ // __cilkrts_stack_frame out of the worker, since it will be stable across
+ // steals. The sf pointer actually points to the *parent's*
+ // __cilkrts_stack_frame, since this function is a non-spawning function
+ // and therefore has no cilk stack frame of its own.
+ __cilkrts_stack_frame *sf = w->current_stack_frame;
+
+ // Save the pedigree node pointed to by the worker. We'll need to restore
+ // that when we exit since the spawn helpers in the cilk_for call tree
+ // will assume that it's valid
+ const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent;
+
+ // Add the leaf pedigree node to the chain. The parent is the root node
+ // to flatten the tree regardless of the DAG branches in the cilk_for
+ // divide-and-conquer recursion.
+ //
+ // The rank is initialized to the low index. The user is
+ // expected to call __cilkrts_bump_loop_rank at the end of the cilk_for
+ // loop body.
+ __cilkrts_pedigree loop_leaf_pedigree;
+
+ loop_leaf_pedigree.rank = (uint64_t)low;
+ loop_leaf_pedigree.parent = loop_root_pedigree;
+
+ // The worker's pedigree always starts with a rank of 0
+ w->pedigree.rank = 0;
+ w->pedigree.parent = &loop_leaf_pedigree;
+
+ // Call the compiler generated cilk_for loop body lambda function
+ body(data, low, high);
+
+ // The loop body may have included spawns, so we must refetch the worker
+ // from the __cilkrts_stack_frame, which is stable regardless of which
+ // worker we're executing on.
+ w = sf->worker;
+
+ // Restore the pedigree chain. It must be valid because the spawn helpers
+ // generated by the cilk_for implementation will access it.
+ w->pedigree.parent = saved_next_pedigree_node;
+}
+
+/* capture_spawn_arg_stack_frame
+ *
+ * Efficiently get the address of the caller's __cilkrts_stack_frame. The
+ * preconditons are that 'w' is the worker at the time of the call and
+ * 'w->current_stack_frame' points to the __cilkrts_stack_frame within the
+ * spawn helper. This function should be called only within the argument list
+ * of a function that is being spawned because that is the only situation in
+ * which these preconditions hold. This function returns the worker
+ * (unchanged) after storing the captured stack frame pointer is stored in the
+ * sf argument.
+ *
+ * The purpose of this function is to get the caller's stack frame in a
+ * context where the caller's worker is known but its stack frame is not
+ * necessarily initialized. The "shrink wrap" optimization delays
+ * initializing the contents of a spawning function's '__cilkrts_stack_frame'
+ * as well as the 'current_stack_frame' pointer within the worker. By calling
+ * this function within a spawning function's argument list, we can ensure
+ * that these initializations have occured but that a detach (which would
+ * invalidate the worker pointer in the caller) has not yet occured. Once the
+ * '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the
+ * remainder of the caller's execution, and becomes an efficient way to get
+ * the worker (much more efficient than calling '__cilkrts_get_tls_worker()'),
+ * even after a spawn or sync.
+ */
+inline __cilkrts_worker*
+capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w)
+{
+ // Get current stack frame
+ sf = w->current_stack_frame;
+#ifdef __INTEL_COMPILER
+# if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101
+ // In older compilers 'w->current_stack_frame' points to the
+ // spawn-helper's stack frame. In newer compiler's however, it points
+ // directly to the pointer's stack frame. (This change was made to avoid
+ // having the spawn helper in the frame list when evaluating function
+ // arguments, thus avoiding corruption when those arguments themselves
+ // contain cilk_spawns.)
+
+ // w->current_stack_frame is the spawn helper's stack frame.
+ // w->current_stack_frame->call_parent is the caller's stack frame.
+ sf = sf->call_parent;
+# endif
+#endif
+ return w;
+}
+
+/*
+ * cilk_for_recursive
+ *
+ * Templatized function to implement the recursive divide-and-conquer
+ * algorithm that's how we implement a cilk_for.
+ *
+ * low - Low loop index we're considering in this portion of the algorithm
+ * high - High loop index we're considering in this portion of the algorithm
+ * body - lambda function for the cilk_for loop body
+ * data - data used by the lambda function
+ * grain - grain size (0 if it should be computed)
+ * w - __cilkrts_worker we're currently executing on
+ * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
+ * the cilk_for loop to flatten out the internal nodes
+ */
+template <typename count_t, typename F>
+static
+void cilk_for_recursive(count_t low, count_t high,
+ F body, void *data, int grain,
+ __cilkrts_worker *w,
+ __cilkrts_pedigree *loop_root_pedigree)
+{
+tail_recurse:
+ // Cilkscreen should not report this call in a stack trace
+ // This needs to be done everytime the worker resumes
+ NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
+
+ count_t count = high - low;
+ // Invariant: count > 0, grain >= 1
+ if (count > grain)
+ {
+ // Invariant: count >= 2
+ count_t mid = low + count / 2;
+ // The worker is valid only until the first spawn and is expensive to
+ // retrieve (using '__cilkrts_get_tls_worker') after the spawn. The
+ // '__cilkrts_stack_frame' is more stable, but isn't initialized until
+ // the first spawn. Thus, we want to grab the address of the
+ // '__cilkrts_stack_frame' after it is initialized but before the
+ // spawn detaches. The only place we can do that is within the
+ // argument list of the spawned function, hence the call to
+ // capture_spawn_arg_stack_frame().
+ __cilkrts_stack_frame *sf;
+#if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
+ // The current version of gcc initializes the sf structure eagerly.
+ // We can take advantage of this fact to avoid calling
+ // `capture_spawn_arg_stack_frame` when compiling with gcc.
+ // Remove this if the "shrink-wrap" optimization is implemented.
+ sf = w->current_stack_frame;
+ _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
+ loop_root_pedigree);
+#else
+ _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
+ capture_spawn_arg_stack_frame(sf, w),
+ loop_root_pedigree);
+#endif
+ w = sf->worker;
+ low = mid;
+
+ goto tail_recurse;
+ }
+
+ // Call the cilk_for loop body lambda function passed in by the compiler to
+ // execute one grain
+ call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
+}
+
+static void noop() { }
+
+/*
+ * cilk_for_root
+ *
+ * Templatized function to implement the top level of a cilk_for loop.
+ *
+ * body - lambda function for the cilk_for loop body
+ * data - data used by the lambda function
+ * count - trip count for loop
+ * grain - grain size (0 if it should be computed)
+ */
+template <typename count_t, typename F>
+static void cilk_for_root(F body, void *data, count_t count, int grain)
+{
+ // Cilkscreen should not report this call in a stack trace
+ NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
+
+ // Pedigree computation:
+ //
+ // If the last pedigree node on entry to the _Cilk_for has value X,
+ // then at the start of each iteration of the loop body, the value of
+ // the last pedigree node should be 0, the value of the second-to-last
+ // node should equal the loop counter, and the value of the
+ // third-to-last node should be X. On return from the _Cilk_for, the
+ // value of the last pedigree should be incremented to X+2. The
+ // pedigree within the loop is thus flattened, such that the depth of
+ // recursion does not affect the results either inside or outside of
+ // the loop. Note that the pedigree after the loop exists is the same
+ // as if a single spawn and sync were executed within this function.
+
+ // TBD: Since the shrink-wrap optimization was turned on in the compiler,
+ // it is not possible to get the current stack frame without actually
+ // forcing a call to bind-thread. This spurious spawn is a temporary
+ // stopgap until the correct intrinsics are added to give us total control
+ // over frame initialization.
+ _Cilk_spawn noop();
+
+ // Fetch the current worker. From that we can get the current stack frame
+ // which will be constant even if we're stolen
+ __cilkrts_worker *w = __cilkrts_get_tls_worker();
+ __cilkrts_stack_frame *sf = w->current_stack_frame;
+
+ // Decrement the rank by one to undo the pedigree change from the
+ // _Cilk_spawn
+ --w->pedigree.rank;
+
+ // Save the current worker pedigree into loop_root_pedigree, which will be
+ // the root node for our flattened pedigree.
+ __cilkrts_pedigree loop_root_pedigree = w->pedigree;
+
+ // Don't splice the loop_root node in yet. It will be done when we
+ // call the loop body lambda function
+// w->pedigree.rank = 0;
+// w->pedigree.next = &loop_root_pedigree;
+
+ /* Spawn is necessary at top-level to force runtime to start up.
+ * Runtime must be started in order to call the grainsize() function.
+ */
+ int gs = grainsize(grain, count);
+ cilk_for_recursive((count_t) 0, count, body, data, gs, w,
+ &loop_root_pedigree);
+
+ // Need to refetch the worker after calling a spawning function.
+ w = sf->worker;
+
+ // Restore the pedigree in the worker.
+ w->pedigree = loop_root_pedigree;
+
+ // Bump the worker pedigree.
+ ++w->pedigree.rank;
+
+ // Implicit sync will increment the pedigree leaf rank again, for a total
+ // of two increments. If the noop spawn above is removed, then we'll need
+ // to re-enable the following code:
+// // If this is an optimized build, then the compiler will have optimized
+// // out the increment of the worker's pedigree in the implied sync. We
+// // need to add one to make the pedigree_loop test work correctly.
+// #if CILKRTS_OPTIMIZED
+// ++sf->worker->pedigree.rank;
+// #endif
+}
+
+// Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and
+// __cilkrts_cilk_for_64.
+extern "C" {
+
+/*
+ * __cilkrts_cilk_for_32
+ *
+ * Implementation of cilk_for for 32-bit trip counts (regardless of processor
+ * word size). Assumes that the range is 0 - count.
+ *
+ * body - lambda function for the cilk_for loop body
+ * data - data used by the lambda function
+ * count - trip count for loop
+ * grain - grain size (0 if it should be computed)
+ */
+
+CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data,
+ cilk32_t count, int grain)
+{
+ // Cilkscreen should not report this call in a stack trace
+ NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
+
+ // Check for an empty range here as an optimization - don't need to do any
+ // __cilkrts_stack_frame initialization
+ if (count > 0)
+ cilk_for_root(body, data, count, grain);
+}
+
+/*
+ * __cilkrts_cilk_for_64
+ *
+ * Implementation of cilk_for for 64-bit trip counts (regardless of processor
+ * word size). Assumes that the range is 0 - count.
+ *
+ * body - lambda function for the cilk_for loop body
+ * data - data used by the lambda function
+ * count - trip count for loop
+ * grain - grain size (0 if it should be computed)
+ */
+CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
+ cilk64_t count, int grain)
+{
+ // Check for an empty range here as an optimization - don't need to do any
+ // __cilkrts_stack_frame initialization
+ if (count > 0)
+ cilk_for_root(body, data, count, grain);
+}
+
+} // end extern "C"
+
+/* End cilk-abi-cilk-for.cpp */