/* Test and benchmark of a couple of parallel sorting algorithms. Copyright (C) 2008 Free Software Foundation, Inc. GCC is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 3, or (at your option) any later version. GCC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GCC; see the file COPYING3. If not see . */ #include #include #include #include #include #include int failures; #define THRESHOLD 100 static void verify (const char *name, double stime, int *array, int count) { int i; double etime = omp_get_wtime (); printf ("%s: %g\n", name, etime - stime); for (i = 1; i < count; i++) if (array[i] < array[i - 1]) { printf ("%s: incorrectly sorted\n", name); failures = 1; } } static void insertsort (int *array, int s, int e) { int i, j, val; for (i = s + 1; i <= e; i++) { val = array[i]; j = i; while (j-- > s && val < array[j]) array[j + 1] = array[j]; array[j + 1] = val; } } struct int_pair { int lo; int hi; }; struct int_pair_stack { struct int_pair *top; #define STACK_SIZE 4 * CHAR_BIT * sizeof (int) struct int_pair arr[STACK_SIZE]; }; static inline void init_int_pair_stack (struct int_pair_stack *stack) { stack->top = &stack->arr[0]; } static inline void push_int_pair_stack (struct int_pair_stack *stack, int lo, int hi) { stack->top->lo = lo; stack->top->hi = hi; stack->top++; } static inline void pop_int_pair_stack (struct int_pair_stack *stack, int *lo, int *hi) { stack->top--; *lo = stack->top->lo; *hi = stack->top->hi; } static inline int size_int_pair_stack (struct int_pair_stack *stack) { return stack->top - &stack->arr[0]; } static inline void busy_wait (void) { #if defined __i386__ || defined __x86_64__ __asm volatile ("rep; nop" : : : "memory"); #elif defined __ia64__ __asm volatile ("hint @pause" : : : "memory"); #elif defined __sparc__ && (defined __arch64__ || defined __sparc_v9__) __asm volatile ("membar #LoadLoad" : : : "memory"); #else __asm volatile ("" : : : "memory"); #endif } static inline void swap (int *array, int a, int b) { int val = array[a]; array[a] = array[b]; array[b] = val; } static inline int choose_pivot (int *array, int lo, int hi) { int mid = (lo + hi) / 2; if (array[mid] < array[lo]) swap (array, lo, mid); if (array[hi] < array[mid]) { swap (array, mid, hi); if (array[mid] < array[lo]) swap (array, lo, mid); } return array[mid]; } static inline int partition (int *array, int lo, int hi) { int pivot = choose_pivot (array, lo, hi); int left = lo; int right = hi; for (;;) { while (array[++left] < pivot); while (array[--right] > pivot); if (left >= right) break; swap (array, left, right); } return left; } static void sort1 (int *array, int count) { omp_lock_t lock; struct int_pair_stack global_stack; int busy = 1; int num_threads; omp_init_lock (&lock); init_int_pair_stack (&global_stack); #pragma omp parallel firstprivate (array, count) { int lo = 0, hi = 0, mid, next_lo, next_hi; bool idle = true; struct int_pair_stack local_stack; init_int_pair_stack (&local_stack); if (omp_get_thread_num () == 0) { num_threads = omp_get_num_threads (); hi = count - 1; idle = false; } for (;;) { if (hi - lo < THRESHOLD) { insertsort (array, lo, hi); lo = hi; } if (lo >= hi) { if (size_int_pair_stack (&local_stack) == 0) { again: omp_set_lock (&lock); if (size_int_pair_stack (&global_stack) == 0) { if (!idle) busy--; if (busy == 0) { omp_unset_lock (&lock); break; } omp_unset_lock (&lock); idle = true; while (size_int_pair_stack (&global_stack) == 0 && busy) busy_wait (); goto again; } if (idle) busy++; pop_int_pair_stack (&global_stack, &lo, &hi); omp_unset_lock (&lock); idle = false; } else pop_int_pair_stack (&local_stack, &lo, &hi); } mid = partition (array, lo, hi); if (mid - lo < hi - mid) { next_lo = mid; next_hi = hi; hi = mid - 1; } else { next_lo = lo; next_hi = mid - 1; lo = mid; } if (next_hi - next_lo < THRESHOLD) insertsort (array, next_lo, next_hi); else { if (size_int_pair_stack (&global_stack) < num_threads - 1) { int size; omp_set_lock (&lock); size = size_int_pair_stack (&global_stack); if (size < num_threads - 1 && size < STACK_SIZE) push_int_pair_stack (&global_stack, next_lo, next_hi); else push_int_pair_stack (&local_stack, next_lo, next_hi); omp_unset_lock (&lock); } else push_int_pair_stack (&local_stack, next_lo, next_hi); } } } omp_destroy_lock (&lock); } static void sort2_1 (int *array, int lo, int hi, int num_threads, int *busy) { int mid; if (hi - lo < THRESHOLD) { insertsort (array, lo, hi); return; } mid = partition (array, lo, hi); if (*busy >= num_threads) { sort2_1 (array, lo, mid - 1, num_threads, busy); sort2_1 (array, mid, hi, num_threads, busy); return; } #pragma omp atomic *busy += 1; #pragma omp parallel num_threads (2) \ firstprivate (array, lo, hi, mid, num_threads, busy) { if (omp_get_thread_num () == 0) sort2_1 (array, lo, mid - 1, num_threads, busy); else { sort2_1 (array, mid, hi, num_threads, busy); #pragma omp atomic *busy -= 1; } } } static void sort2 (int *array, int count) { int num_threads; int busy = 1; #pragma omp parallel #pragma omp single nowait num_threads = omp_get_num_threads (); sort2_1 (array, 0, count - 1, num_threads, &busy); } #if _OPENMP >= 200805 static void sort3_1 (int *array, int lo, int hi) { int mid; if (hi - lo < THRESHOLD) { insertsort (array, lo, hi); return; } mid = partition (array, lo, hi); #pragma omp task sort3_1 (array, lo, mid - 1); sort3_1 (array, mid, hi); } static void sort3 (int *array, int count) { #pragma omp parallel #pragma omp single sort3_1 (array, 0, count - 1); } #endif int main (int argc, char **argv) { int i, count = 1000000; double stime; int *unsorted, *sorted, num_threads; if (argc >= 2) count = strtoul (argv[1], NULL, 0); unsorted = malloc (count * sizeof (int)); sorted = malloc (count * sizeof (int)); if (unsorted == NULL || sorted == NULL) { puts ("allocation failure"); exit (1); } srand (0xdeadbeef); for (i = 0; i < count; i++) unsorted[i] = rand (); omp_set_nested (1); omp_set_dynamic (0); #pragma omp parallel #pragma omp single nowait num_threads = omp_get_num_threads (); printf ("Threads: %d\n", num_threads); memcpy (sorted, unsorted, count * sizeof (int)); stime = omp_get_wtime (); sort1 (sorted, count); verify ("sort1", stime, sorted, count); memcpy (sorted, unsorted, count * sizeof (int)); stime = omp_get_wtime (); sort2 (sorted, count); verify ("sort2", stime, sorted, count); #if _OPENMP >= 200805 memcpy (sorted, unsorted, count * sizeof (int)); stime = omp_get_wtime (); sort3 (sorted, count); verify ("sort3", stime, sorted, count); #endif return 0; }