diff options
author | Christophe Lyon <christophe.lyon@st.com> | 2013-01-17 17:23:11 +0100 |
---|---|---|
committer | Christophe Lyon <christophe.lyon@st.com> | 2013-01-17 17:51:15 +0100 |
commit | 01af0a532c91523692d7b32ed08e0aace8123cba (patch) | |
tree | 199360c2ab91c91a7c1382c676982a09ea251ef2 | |
parent | 302cedf48e55284564aae5256adfdebb1a7a6a0c (diff) | |
download | platform_external_arm-neon-tests-01af0a532c91523692d7b32ed08e0aace8123cba.tar.gz platform_external_arm-neon-tests-01af0a532c91523692d7b32ed08e0aace8123cba.tar.bz2 platform_external_arm-neon-tests-01af0a532c91523692d7b32ed08e0aace8123cba.zip |
Reorganize input data buffer to try to detect some out of bounds accesses.
Add dedicated input for vdup, vld1_dup, vldX, vldX_dup, vldX_lane, vstX_lane.
-rw-r--r-- | compute_ref.axf | bin | 3246928 -> 3175368 bytes | |||
-rw-r--r-- | compute_ref.c | 3 | ||||
-rwxr-xr-x | compute_ref.gccarm | bin | 4291760 -> 3967245 bytes | |||
-rw-r--r-- | compute_ref.gccarm-rvct | bin | 4498672 -> 4438112 bytes | |||
-rw-r--r-- | compute_ref_data.c | 322 | ||||
-rw-r--r-- | ref-rvct-neon.txt | 118 | ||||
-rw-r--r-- | ref_vdup.c | 4 | ||||
-rw-r--r-- | ref_vld1_dup.c | 6 | ||||
-rw-r--r-- | ref_vldX.c | 3 | ||||
-rw-r--r-- | ref_vldX_dup.c | 3 | ||||
-rw-r--r-- | ref_vldX_lane.c | 11 | ||||
-rw-r--r-- | ref_vstX_lane.c | 3 | ||||
-rw-r--r-- | stm-arm-neon-ref.h | 176 |
13 files changed, 557 insertions, 92 deletions
diff --git a/compute_ref.axf b/compute_ref.axf Binary files differindex ab1ae89..c1e139e 100644 --- a/compute_ref.axf +++ b/compute_ref.axf diff --git a/compute_ref.c b/compute_ref.c index 47f51cb..6d4c1e9 100644 --- a/compute_ref.c +++ b/compute_ref.c @@ -187,6 +187,8 @@ extern void exec_dsp(void); /* DSP (non-NEON) intrinsics */ extern void exec_dspfns(void); /* DSP FNS (non-NEON/ITU) intrinsics */ #endif +#include "compute_ref_data.c" + int main (void) { #if defined(_MSC_VER) @@ -339,7 +341,6 @@ int main (void) exec_vrecpe (); exec_vrsqrte (); - exec_vcage (); exec_vcale (); exec_vcagt (); diff --git a/compute_ref.gccarm b/compute_ref.gccarm Binary files differindex b42db61..61a18d8 100755 --- a/compute_ref.gccarm +++ b/compute_ref.gccarm diff --git a/compute_ref.gccarm-rvct b/compute_ref.gccarm-rvct Binary files differindex 1bfc143..3587a97 100644 --- a/compute_ref.gccarm-rvct +++ b/compute_ref.gccarm-rvct diff --git a/compute_ref_data.c b/compute_ref_data.c new file mode 100644 index 0000000..48e8025 --- /dev/null +++ b/compute_ref_data.c @@ -0,0 +1,322 @@ + +#ifdef __arm__ +#include <arm_neon.h> +#else +#include "stm-arm-neon.h" +#endif +#include "stm-arm-neon-ref.h" + +/* Initialization helpers; 4 slices are needed for vld2, vld3 and + vld4. */ +#define MY_INIT_TAB(T,W,N) xNAME(INIT_TAB,N)(T##W##_t) +#define MY_INIT_TAB2(T,W,N) xNAME(INIT_TAB2,N)(T##W##_t) +#define MY_INIT_TAB3(T,W,N) xNAME(INIT_TAB3,N)(T##W##_t) +#define MY_INIT_TAB4(T,W,N) xNAME(INIT_TAB4,N)(T##W##_t) + +/* Initialized input buffers. */ +#define VECT_VAR_DECL_INIT(V, T, W, N) \ + VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,N) }; + +/* Specialized initializer with 4 entries, as used by vldX_dup and + vdup tests, which iterated 4 times on input buffers. */ +#define VECT_VAR_DECL_INIT4(V, T, W, N) \ + VECT_VAR_DECL(V,T,W,N) [] = { MY_INIT_TAB(T,W,4) }; + +/* Initializers for arrays of vectors. */ +#define VECT_ARRAY_INIT2(V, T, W, N) \ + T##W##_t VECT_ARRAY_VAR(V,T,W,N,2)[] = \ + { MY_INIT_TAB(T,W,N) \ + MY_INIT_TAB2(T,W,N) }; + +#define VECT_ARRAY_INIT3(V, T, W, N) \ + T##W##_t VECT_ARRAY_VAR(V,T,W,N,3)[] = \ + { MY_INIT_TAB(T,W,N) \ + MY_INIT_TAB2(T,W,N) \ + MY_INIT_TAB3(T,W,N) }; + +#define VECT_ARRAY_INIT4(V, T, W, N) \ + T##W##_t VECT_ARRAY_VAR(V,T,W,N,4)[] = \ + { MY_INIT_TAB(T,W,N) \ + MY_INIT_TAB2(T,W,N) \ + MY_INIT_TAB3(T,W,N) \ + MY_INIT_TAB4(T,W,N) }; + +/* Sample initialization vectors. */ +#define INIT_TAB_1(T) \ + (T)-16, +#define INIT_TAB2_1(T) \ + (T)-15, +#define INIT_TAB3_1(T) \ + (T)-14, +#define INIT_TAB4_1(T) \ + (T)-13, + +#define INIT_TAB_2(T) \ + (T)-16, (T)-15, +#define INIT_TAB2_2(T) \ + (T)-14, (T)-13, +#define INIT_TAB3_2(T) \ + (T)-12, (T)-11, +#define INIT_TAB4_2(T) \ + (T)-10, (T)-9, + +/* Initializer for vld3_lane tests. */ +#define INIT_TAB_3(T) \ + (T)-16, (T)-15, (T)-14, + +#define INIT_TAB_4(T) \ + (T)-16, (T)-15, (T)-14, (T)-13, +#define INIT_TAB2_4(T) \ + (T)-12, (T)-11, (T)-10, (T)-9, +#define INIT_TAB3_4(T) \ + (T)-8, (T)-7, (T)-6, (T)-5, +#define INIT_TAB4_4(T) \ + (T)-4, (T)-3, (T)-2, (T)-1, + +#define INIT_TAB_8(T) \ + (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9, +#define INIT_TAB2_8(T) \ + (T)-8, (T)-7, (T)-6, (T)-5, (T)-4, (T)-3, (T)-2, (T)-1, +#define INIT_TAB3_8(T) \ + (T)0, (T)1, (T)2, (T)3, (T)4, (T)5, (T)6, (T)7, +#define INIT_TAB4_8(T) \ + (T)8, (T)9, (T)10, (T)11, (T)12, (T)13, (T)14, (T)15, + +#define INIT_TAB_16(T) \ + (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9, \ + (T)-8, (T)-7, (T)-6, (T)-5, (T)-4, (T)-3, (T)-2, (T)-1, +#define INIT_TAB2_16(T) \ + (T)0, (T)1, (T)2, (T)3, (T)4, (T)5, (T)6, (T)7, \ + (T)8, (T)9, (T)10, (T)11, (T)12, (T)13, (T)14, (T)15, +#define INIT_TAB3_16(T) \ + (T)16, (T)17, (T)18, (T)19, (T)20, (T)21, (T)22, (T)23, \ + (T)24, (T)25, (T)26, (T)27, (T)28, (T)29, (T)30, (T)31, +#define INIT_TAB4_16(T) \ + (T)32, (T)33, (T)34, (T)35, (T)36, (T)37, (T)38, (T)39, \ + (T)40, (T)41, (T)42, (T)43, (T)44, (T)45, (T)46, (T)47, + +/* Input buffers, one of each size. */ +/* Insert some padding to try to exhibit out of bounds accesses. */ +VECT_VAR_DECL_INIT(buffer, int, 8, 8); +PAD(buffer_pad, int, 8, 8); +VECT_VAR_DECL_INIT(buffer, int, 16, 4); +PAD(buffer_pad, int, 16, 4); +VECT_VAR_DECL_INIT(buffer, int, 32, 2); +PAD(buffer_pad, int, 32, 2); +VECT_VAR_DECL_INIT(buffer, int, 64, 1); +PAD(buffer_pad, int, 64, 1); +VECT_VAR_DECL_INIT(buffer, uint, 8, 8); +PAD(buffer_pad, uint, 8, 8); +VECT_VAR_DECL_INIT(buffer, uint, 16, 4); +PAD(buffer_pad, uint, 16, 4); +VECT_VAR_DECL_INIT(buffer, uint, 32, 2); +PAD(buffer_pad, uint, 32, 2); +VECT_VAR_DECL_INIT(buffer, uint, 64, 1); +PAD(buffer_pad, uint, 64, 1); +VECT_VAR_DECL_INIT(buffer, float, 32, 2); +PAD(buffer_pad, float, 32, 2); +VECT_VAR_DECL_INIT(buffer, int, 8, 16); +PAD(buffer_pad, int, 8, 16); +VECT_VAR_DECL_INIT(buffer, int, 16, 8); +PAD(buffer_pad, int, 16, 8); +VECT_VAR_DECL_INIT(buffer, int, 32, 4); +PAD(buffer_pad, int, 32, 4); +VECT_VAR_DECL_INIT(buffer, int, 64, 2); +PAD(buffer_pad, int, 64, 2); +VECT_VAR_DECL_INIT(buffer, uint, 8, 16); +PAD(buffer_pad, uint, 8, 16); +VECT_VAR_DECL_INIT(buffer, uint, 16, 8); +PAD(buffer_pad, uint, 16, 8); +VECT_VAR_DECL_INIT(buffer, uint, 32, 4); +PAD(buffer_pad, uint, 32, 4); +VECT_VAR_DECL_INIT(buffer, uint, 64, 2); +PAD(buffer_pad, uint, 64, 2); +VECT_VAR_DECL_INIT(buffer, float, 32, 4); +PAD(buffer_pad, float, 32, 4); + +/* The tests for vld1_dup and vdup expect at least 4 entries in the + input buffer, so force 1- and 2-elements initializers to have 4 + entries. */ +VECT_VAR_DECL_INIT(buffer_dup, int, 8, 8); +VECT_VAR_DECL(buffer_dup_pad, int, 8, 8); +VECT_VAR_DECL_INIT(buffer_dup, int, 16, 4); +VECT_VAR_DECL(buffer_dup_pad, int, 16, 4); +VECT_VAR_DECL_INIT4(buffer_dup, int, 32, 2); +VECT_VAR_DECL(buffer_dup_pad, int, 32, 2); +VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 1); +VECT_VAR_DECL(buffer_dup_pad, int, 64, 1); +VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 8); +VECT_VAR_DECL(buffer_dup_pad, uint, 8, 8); +VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 4); +VECT_VAR_DECL(buffer_dup_pad, uint, 16, 4); +VECT_VAR_DECL_INIT4(buffer_dup, uint, 32, 2); +VECT_VAR_DECL(buffer_dup_pad, uint, 32, 2); +VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 1); +VECT_VAR_DECL(buffer_dup_pad, uint, 64, 1); +VECT_VAR_DECL_INIT4(buffer_dup, float, 32, 2); +VECT_VAR_DECL(buffer_dup_pad, float, 32, 2); +VECT_VAR_DECL_INIT(buffer_dup, int, 8, 16); +VECT_VAR_DECL(buffer_dup_pad, int, 8, 16); +VECT_VAR_DECL_INIT(buffer_dup, int, 16, 8); +VECT_VAR_DECL(buffer_dup_pad, int, 16, 8); +VECT_VAR_DECL_INIT(buffer_dup, int, 32, 4); +VECT_VAR_DECL(buffer_dup_pad, int, 32, 4); +VECT_VAR_DECL_INIT4(buffer_dup, int, 64, 2); +VECT_VAR_DECL(buffer_dup_pad, int, 64, 2); +VECT_VAR_DECL_INIT(buffer_dup, uint, 8, 16); +VECT_VAR_DECL(buffer_dup_pad, uint, 8, 16); +VECT_VAR_DECL_INIT(buffer_dup, uint, 16, 8); +VECT_VAR_DECL(buffer_dup_pad, uint, 16, 8); +VECT_VAR_DECL_INIT(buffer_dup, uint, 32, 4); +VECT_VAR_DECL(buffer_dup_pad, uint, 32, 4); +VECT_VAR_DECL_INIT4(buffer_dup, uint, 64, 2); +VECT_VAR_DECL(buffer_dup_pad, uint, 64, 2); +VECT_VAR_DECL_INIT(buffer_dup, float, 32, 4); +VECT_VAR_DECL(buffer_dup_pad, float, 32, 4); + +/* Input buffers for vld2, 1 of each size */ +VECT_ARRAY_INIT2(buffer_vld2, int, 8, 8); +PAD(buffer_vld2_pad, int, 8, 8); +VECT_ARRAY_INIT2(buffer_vld2, int, 16, 4); +PAD(buffer_vld2_pad, int, 16, 4); +VECT_ARRAY_INIT2(buffer_vld2, int, 32, 2); +PAD(buffer_vld2_pad, int, 32, 2); +VECT_ARRAY_INIT2(buffer_vld2, int, 64, 1); +PAD(buffer_vld2_pad, int, 64, 1); +VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 8); +PAD(buffer_vld2_pad, uint, 8, 8); +VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 4); +PAD(buffer_vld2_pad, uint, 16, 4); +VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 2); +PAD(buffer_vld2_pad, uint, 32, 2); +VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 1); +PAD(buffer_vld2_pad, uint, 64, 1); +VECT_ARRAY_INIT2(buffer_vld2, float, 32, 2); +PAD(buffer_vld2_pad, float, 32, 2); +VECT_ARRAY_INIT2(buffer_vld2, int, 8, 16); +PAD(buffer_vld2_pad, int, 8, 16); +VECT_ARRAY_INIT2(buffer_vld2, int, 16, 8); +PAD(buffer_vld2_pad, int, 16, 8); +VECT_ARRAY_INIT2(buffer_vld2, int, 32, 4); +PAD(buffer_vld2_pad, int, 32, 4); +VECT_ARRAY_INIT2(buffer_vld2, int, 64, 2); +PAD(buffer_vld2_pad, int, 64, 2); +VECT_ARRAY_INIT2(buffer_vld2, uint, 8, 16); +PAD(buffer_vld2_pad, uint, 8, 16); +VECT_ARRAY_INIT2(buffer_vld2, uint, 16, 8); +PAD(buffer_vld2_pad, uint, 16, 8); +VECT_ARRAY_INIT2(buffer_vld2, uint, 32, 4); +PAD(buffer_vld2_pad, uint, 32, 4); +VECT_ARRAY_INIT2(buffer_vld2, uint, 64, 2); +PAD(buffer_vld2_pad, uint, 64, 2); +VECT_ARRAY_INIT2(buffer_vld2, float, 32, 4); +PAD(buffer_vld2_pad, float, 32, 4); + +/* Input buffers for vld3, 1 of each size */ +VECT_ARRAY_INIT3(buffer_vld3, int, 8, 8); +PAD(buffer_vld3_pad, int, 8, 8); +VECT_ARRAY_INIT3(buffer_vld3, int, 16, 4); +PAD(buffer_vld3_pad, int, 16, 4); +VECT_ARRAY_INIT3(buffer_vld3, int, 32, 2); +PAD(buffer_vld3_pad, int, 32, 2); +VECT_ARRAY_INIT3(buffer_vld3, int, 64, 1); +PAD(buffer_vld3_pad, int, 64, 1); +VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 8); +PAD(buffer_vld3_pad, uint, 8, 8); +VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 4); +PAD(buffer_vld3_pad, uint, 16, 4); +VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 2); +PAD(buffer_vld3_pad, uint, 32, 2); +VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 1); +PAD(buffer_vld3_pad, uint, 64, 1); +VECT_ARRAY_INIT3(buffer_vld3, float, 32, 2); +PAD(buffer_vld3_pad, float, 32, 2); +VECT_ARRAY_INIT3(buffer_vld3, int, 8, 16); +PAD(buffer_vld3_pad, int, 8, 16); +VECT_ARRAY_INIT3(buffer_vld3, int, 16, 8); +PAD(buffer_vld3_pad, int, 16, 8); +VECT_ARRAY_INIT3(buffer_vld3, int, 32, 4); +PAD(buffer_vld3_pad, int, 32, 4); +VECT_ARRAY_INIT3(buffer_vld3, int, 64, 2); +PAD(buffer_vld3_pad, int, 64, 2); +VECT_ARRAY_INIT3(buffer_vld3, uint, 8, 16); +PAD(buffer_vld3_pad, uint, 8, 16); +VECT_ARRAY_INIT3(buffer_vld3, uint, 16, 8); +PAD(buffer_vld3_pad, uint, 16, 8); +VECT_ARRAY_INIT3(buffer_vld3, uint, 32, 4); +PAD(buffer_vld3_pad, uint, 32, 4); +VECT_ARRAY_INIT3(buffer_vld3, uint, 64, 2); +PAD(buffer_vld3_pad, uint, 64, 2); +VECT_ARRAY_INIT3(buffer_vld3, float, 32, 4); +PAD(buffer_vld3_pad, float, 32, 4); + +/* Input buffers for vld4, 1 of each size */ +VECT_ARRAY_INIT4(buffer_vld4, int, 8, 8); +PAD(buffer_vld4_pad, int, 8, 8); +VECT_ARRAY_INIT4(buffer_vld4, int, 16, 4); +PAD(buffer_vld4_pad, int, 16, 4); +VECT_ARRAY_INIT4(buffer_vld4, int, 32, 2); +PAD(buffer_vld4_pad, int, 32, 2); +VECT_ARRAY_INIT4(buffer_vld4, int, 64, 1); +PAD(buffer_vld4_pad, int, 64, 1); +VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 8); +PAD(buffer_vld4_pad, uint, 8, 8); +VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 4); +PAD(buffer_vld4_pad, uint, 16, 4); +VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 2); +PAD(buffer_vld4_pad, uint, 32, 2); +VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 1); +PAD(buffer_vld4_pad, uint, 64, 1); +VECT_ARRAY_INIT4(buffer_vld4, float, 32, 2); +PAD(buffer_vld4_pad, float, 32, 2); +VECT_ARRAY_INIT4(buffer_vld4, int, 8, 16); +PAD(buffer_vld4_pad, int, 8, 16); +VECT_ARRAY_INIT4(buffer_vld4, int, 16, 8); +PAD(buffer_vld4_pad, int, 16, 8); +VECT_ARRAY_INIT4(buffer_vld4, int, 32, 4); +PAD(buffer_vld4_pad, int, 32, 4); +VECT_ARRAY_INIT4(buffer_vld4, int, 64, 2); +PAD(buffer_vld4_pad, int, 64, 2); +VECT_ARRAY_INIT4(buffer_vld4, uint, 8, 16); +PAD(buffer_vld4_pad, uint, 8, 16); +VECT_ARRAY_INIT4(buffer_vld4, uint, 16, 8); +PAD(buffer_vld4_pad, uint, 16, 8); +VECT_ARRAY_INIT4(buffer_vld4, uint, 32, 4); +PAD(buffer_vld4_pad, uint, 32, 4); +VECT_ARRAY_INIT4(buffer_vld4, uint, 64, 2); +PAD(buffer_vld4_pad, uint, 64, 2); +VECT_ARRAY_INIT4(buffer_vld4, float, 32, 4); +PAD(buffer_vld4_pad, float, 32, 4); + +/* Input buffers for vld2_lane */ +VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 8, 2); +VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 16, 2); +VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 32, 2); +VECT_VAR_DECL_INIT(buffer_vld2_lane, int, 64, 2); +VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 8, 2); +VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 16, 2); +VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 32, 2); +VECT_VAR_DECL_INIT(buffer_vld2_lane, uint, 64, 2); +VECT_VAR_DECL_INIT(buffer_vld2_lane, float, 32, 2); + +/* Input buffers for vld3_lane */ +VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 8, 3); +VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 16, 3); +VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 32, 3); +VECT_VAR_DECL_INIT(buffer_vld3_lane, int, 64, 3); +VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 8, 3); +VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 16, 3); +VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 32, 3); +VECT_VAR_DECL_INIT(buffer_vld3_lane, uint, 64, 3); +VECT_VAR_DECL_INIT(buffer_vld3_lane, float, 32, 3); + +/* Input buffers for vld4_lane */ +VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 8, 4); +VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 16, 4); +VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 32, 4); +VECT_VAR_DECL_INIT(buffer_vld4_lane, int, 64, 4); +VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 8, 4); +VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 16, 4); +VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 32, 4); +VECT_VAR_DECL_INIT(buffer_vld4_lane, uint, 64, 4); +VECT_VAR_DECL_INIT(buffer_vld4_lane, float, 32, 4); diff --git a/ref-rvct-neon.txt b/ref-rvct-neon.txt index 010e138..8ef013c 100644 --- a/ref-rvct-neon.txt +++ b/ref-rvct-neon.txt @@ -2014,11 +2014,11 @@ VLD3/VLD3Q:41:result_uint16x4 [] = { fff8, fff9, fffa, fffb, } VLD3/VLD3Q:42:result_uint32x2 [] = { fffffff4, fffffff5, } VLD3/VLD3Q:43:result_uint64x1 [] = { fffffffffffffff2, } VLD3/VLD3Q:44:result_float32x2 [] = { c1400000 -0x1.8000000p+3 -12, c1300000 -0x1.6000000p+3 -11, } -VLD3/VLD3Q:45:result_int8x16 [] = { fffffff0, ffffffff, fffffff1, ffffffff, fffffff2, ffffffff, fffffff3, ffffffff, fffffff4, ffffffff, fffffff5, ffffffff, fffffff6, ffffffff, fffffff7, ffffffff, } +VLD3/VLD3Q:45:result_int8x16 [] = { 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f, } VLD3/VLD3Q:46:result_int16x8 [] = { 0, 1, 2, 3, 4, 5, 6, 7, } VLD3/VLD3Q:47:result_int32x4 [] = { fffffff8, fffffff9, fffffffa, fffffffb, } VLD3/VLD3Q:48:result_int64x2 [] = { 3333333333333333, 3333333333333333, } -VLD3/VLD3Q:49:result_uint8x16 [] = { f0, ff, f1, ff, f2, ff, f3, ff, f4, ff, f5, ff, f6, ff, f7, ff, } +VLD3/VLD3Q:49:result_uint8x16 [] = { 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f, } VLD3/VLD3Q:50:result_uint16x8 [] = { 0, 1, 2, 3, 4, 5, 6, 7, } VLD3/VLD3Q:51:result_uint32x4 [] = { fffffff8, fffffff9, fffffffa, fffffffb, } VLD3/VLD3Q:52:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } @@ -2074,11 +2074,11 @@ VLD4/VLD4Q:41:result_uint16x4 [] = { fff8, fff9, fffa, fffb, } VLD4/VLD4Q:42:result_uint32x2 [] = { fffffff4, fffffff5, } VLD4/VLD4Q:43:result_uint64x1 [] = { fffffffffffffff2, } VLD4/VLD4Q:44:result_float32x2 [] = { c1400000 -0x1.8000000p+3 -12, c1300000 -0x1.6000000p+3 -11, } -VLD4/VLD4Q:45:result_int8x16 [] = { fffffff0, ffffffff, fffffff1, ffffffff, fffffff2, ffffffff, fffffff3, ffffffff, fffffff4, ffffffff, fffffff5, ffffffff, fffffff6, ffffffff, fffffff7, ffffffff, } +VLD4/VLD4Q:45:result_int8x16 [] = { 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f, } VLD4/VLD4Q:46:result_int16x8 [] = { 0, 1, 2, 3, 4, 5, 6, 7, } VLD4/VLD4Q:47:result_int32x4 [] = { fffffff8, fffffff9, fffffffa, fffffffb, } VLD4/VLD4Q:48:result_int64x2 [] = { 3333333333333333, 3333333333333333, } -VLD4/VLD4Q:49:result_uint8x16 [] = { f0, ff, f1, ff, f2, ff, f3, ff, f4, ff, f5, ff, f6, ff, f7, ff, } +VLD4/VLD4Q:49:result_uint8x16 [] = { 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f, } VLD4/VLD4Q:50:result_uint16x8 [] = { 0, 1, 2, 3, 4, 5, 6, 7, } VLD4/VLD4Q:51:result_uint32x4 [] = { fffffff8, fffffff9, fffffffa, fffffffb, } VLD4/VLD4Q:52:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } @@ -2094,11 +2094,11 @@ VLD4/VLD4Q:59:result_uint16x4 [] = { fffc, fffd, fffe, ffff, } VLD4/VLD4Q:60:result_uint32x2 [] = { fffffff6, fffffff7, } VLD4/VLD4Q:61:result_uint64x1 [] = { fffffffffffffff3, } VLD4/VLD4Q:62:result_float32x2 [] = { c1200000 -0x1.4000000p+3 -10, c1100000 -0x1.2000000p+3 -9, } -VLD4/VLD4Q:63:result_int8x16 [] = { fffffff8, ffffffff, fffffff9, ffffffff, fffffffa, ffffffff, fffffffb, ffffffff, fffffffc, ffffffff, fffffffd, ffffffff, fffffffe, ffffffff, ffffffff, ffffffff, } +VLD4/VLD4Q:63:result_int8x16 [] = { 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 2a, 2b, 2c, 2d, 2e, 2f, } VLD4/VLD4Q:64:result_int16x8 [] = { 8, 9, a, b, c, d, e, f, } VLD4/VLD4Q:65:result_int32x4 [] = { fffffffc, fffffffd, fffffffe, ffffffff, } VLD4/VLD4Q:66:result_int64x2 [] = { 3333333333333333, 3333333333333333, } -VLD4/VLD4Q:67:result_uint8x16 [] = { f8, ff, f9, ff, fa, ff, fb, ff, fc, ff, fd, ff, fe, ff, ff, ff, } +VLD4/VLD4Q:67:result_uint8x16 [] = { 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 2a, 2b, 2c, 2d, 2e, 2f, } VLD4/VLD4Q:68:result_uint16x8 [] = { 8, 9, a, b, c, d, e, f, } VLD4/VLD4Q:69:result_uint32x4 [] = { fffffffc, fffffffd, fffffffe, ffffffff, } VLD4/VLD4Q:70:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } @@ -2831,24 +2831,24 @@ VTRN/VTRNQ:16:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } VTRN/VTRNQ:17:result_float32x4 [] = { c1800000 -0x1.0000000p+4 -16, c1700000 -0x1.e000000p+3 -15, 42073333 0x1.0e66660p+5 33.8, 42073333 0x1.0e66660p+5 33.8, } VTRN/VTRNQ chunk 1 output: -VTRN/VTRNQ:18:result_int8x8 [] = { fffffff1, 11, 11, fffffff2, fffffff3, 11, 11, fffffff4, } -VTRN/VTRNQ:19:result_int16x4 [] = { fffffff1, 22, 22, fffffff2, } -VTRN/VTRNQ:20:result_int32x2 [] = { fffffff1, 33, } +VTRN/VTRNQ:18:result_int8x8 [] = { fffffff4, fffffff5, 11, 11, fffffff6, fffffff7, 11, 11, } +VTRN/VTRNQ:19:result_int16x4 [] = { fffffff2, fffffff3, 22, 22, } +VTRN/VTRNQ:20:result_int32x2 [] = { 33, 33, } VTRN/VTRNQ:21:result_int64x1 [] = { 3333333333333333, } -VTRN/VTRNQ:22:result_uint8x8 [] = { f1, 55, 55, f2, f3, 55, 55, f4, } -VTRN/VTRNQ:23:result_uint16x4 [] = { fff1, 66, 66, fff2, } -VTRN/VTRNQ:24:result_uint32x2 [] = { fffffff1, 77, } +VTRN/VTRNQ:22:result_uint8x8 [] = { f4, f5, 55, 55, f6, f7, 55, 55, } +VTRN/VTRNQ:23:result_uint16x4 [] = { fff2, fff3, 66, 66, } +VTRN/VTRNQ:24:result_uint32x2 [] = { 77, 77, } VTRN/VTRNQ:25:result_uint64x1 [] = { 3333333333333333, } -VTRN/VTRNQ:26:result_float32x2 [] = { c1700000 -0x1.e000000p+3 -15, 42066666 0x1.0ccccc0p+5 33.6, } -VTRN/VTRNQ:27:result_int8x16 [] = { fffffff1, 11, 11, fffffff2, fffffff3, 11, 11, fffffff4, fffffff5, 11, 11, fffffff6, fffffff7, 11, 11, fffffff8, } -VTRN/VTRNQ:28:result_int16x8 [] = { fffffff1, 22, 22, fffffff2, fffffff3, 22, 22, fffffff4, } -VTRN/VTRNQ:29:result_int32x4 [] = { fffffff1, 33, 33, fffffff2, } +VTRN/VTRNQ:26:result_float32x2 [] = { 42066666 0x1.0ccccc0p+5 33.6, 42066666 0x1.0ccccc0p+5 33.6, } +VTRN/VTRNQ:27:result_int8x16 [] = { fffffff8, fffffff9, 11, 11, fffffffa, fffffffb, 11, 11, fffffffc, fffffffd, 11, 11, fffffffe, ffffffff, 11, 11, } +VTRN/VTRNQ:28:result_int16x8 [] = { fffffff4, fffffff5, 22, 22, fffffff6, fffffff7, 22, 22, } +VTRN/VTRNQ:29:result_int32x4 [] = { fffffff2, fffffff3, 33, 33, } VTRN/VTRNQ:30:result_int64x2 [] = { 3333333333333333, 3333333333333333, } -VTRN/VTRNQ:31:result_uint8x16 [] = { f1, 55, 55, f2, f3, 55, 55, f4, f5, 55, 55, f6, f7, 55, 55, f8, } -VTRN/VTRNQ:32:result_uint16x8 [] = { fff1, 66, 66, fff2, fff3, 66, 66, fff4, } -VTRN/VTRNQ:33:result_uint32x4 [] = { fffffff1, 77, 77, fffffff2, } +VTRN/VTRNQ:31:result_uint8x16 [] = { f8, f9, 55, 55, fa, fb, 55, 55, fc, fd, 55, 55, fe, ff, 55, 55, } +VTRN/VTRNQ:32:result_uint16x8 [] = { fff4, fff5, 66, 66, fff6, fff7, 66, 66, } +VTRN/VTRNQ:33:result_uint32x4 [] = { fffffff2, fffffff3, 77, 77, } VTRN/VTRNQ:34:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } -VTRN/VTRNQ:35:result_float32x4 [] = { c1700000 -0x1.e000000p+3 -15, 42073333 0x1.0e66660p+5 33.8, 42073333 0x1.0e66660p+5 33.8, c1600000 -0x1.c000000p+3 -14, } +VTRN/VTRNQ:35:result_float32x4 [] = { c1600000 -0x1.c000000p+3 -14, c1500000 -0x1.a000000p+3 -13, 42073333 0x1.0e66660p+5 33.8, 42073333 0x1.0e66660p+5 33.8, } VUZP/VUZPQ chunk 0 output: VUZP/VUZPQ:0:result_int8x8 [] = { fffffff0, fffffff1, fffffff2, fffffff3, fffffff4, fffffff5, fffffff6, fffffff7, } @@ -2871,24 +2871,24 @@ VUZP/VUZPQ:16:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } VUZP/VUZPQ:17:result_float32x4 [] = { c1800000 -0x1.0000000p+4 -16, c1700000 -0x1.e000000p+3 -15, c1600000 -0x1.c000000p+3 -14, c1500000 -0x1.a000000p+3 -13, } VUZP/VUZPQ chunk 1 output: -VUZP/VUZPQ:18:result_int8x8 [] = { fffffff1, fffffff2, fffffff3, fffffff4, fffffff5, fffffff6, fffffff7, 11, } -VUZP/VUZPQ:19:result_int16x4 [] = { fffffff1, fffffff2, fffffff3, 22, } -VUZP/VUZPQ:20:result_int32x2 [] = { fffffff1, 33, } +VUZP/VUZPQ:18:result_int8x8 [] = { 11, 11, 11, 11, 11, 11, 11, 11, } +VUZP/VUZPQ:19:result_int16x4 [] = { 22, 22, 22, 22, } +VUZP/VUZPQ:20:result_int32x2 [] = { 33, 33, } VUZP/VUZPQ:21:result_int64x1 [] = { 3333333333333333, } -VUZP/VUZPQ:22:result_uint8x8 [] = { f1, f2, f3, f4, f5, f6, f7, 55, } -VUZP/VUZPQ:23:result_uint16x4 [] = { fff1, fff2, fff3, 66, } -VUZP/VUZPQ:24:result_uint32x2 [] = { fffffff1, 77, } +VUZP/VUZPQ:22:result_uint8x8 [] = { 55, 55, 55, 55, 55, 55, 55, 55, } +VUZP/VUZPQ:23:result_uint16x4 [] = { 66, 66, 66, 66, } +VUZP/VUZPQ:24:result_uint32x2 [] = { 77, 77, } VUZP/VUZPQ:25:result_uint64x1 [] = { 3333333333333333, } -VUZP/VUZPQ:26:result_float32x2 [] = { c1700000 -0x1.e000000p+3 -15, 42066666 0x1.0ccccc0p+5 33.6, } -VUZP/VUZPQ:27:result_int8x16 [] = { fffffff1, fffffff2, fffffff3, fffffff4, fffffff5, fffffff6, fffffff7, fffffff8, fffffff9, fffffffa, fffffffb, fffffffc, fffffffd, fffffffe, ffffffff, 11, } -VUZP/VUZPQ:28:result_int16x8 [] = { fffffff1, fffffff2, fffffff3, fffffff4, fffffff5, fffffff6, fffffff7, 22, } -VUZP/VUZPQ:29:result_int32x4 [] = { fffffff1, fffffff2, fffffff3, 33, } +VUZP/VUZPQ:26:result_float32x2 [] = { 42066666 0x1.0ccccc0p+5 33.6, 42066666 0x1.0ccccc0p+5 33.6, } +VUZP/VUZPQ:27:result_int8x16 [] = { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, } +VUZP/VUZPQ:28:result_int16x8 [] = { 22, 22, 22, 22, 22, 22, 22, 22, } +VUZP/VUZPQ:29:result_int32x4 [] = { 33, 33, 33, 33, } VUZP/VUZPQ:30:result_int64x2 [] = { 3333333333333333, 3333333333333333, } -VUZP/VUZPQ:31:result_uint8x16 [] = { f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff, 55, } -VUZP/VUZPQ:32:result_uint16x8 [] = { fff1, fff2, fff3, fff4, fff5, fff6, fff7, 66, } -VUZP/VUZPQ:33:result_uint32x4 [] = { fffffff1, fffffff2, fffffff3, 77, } +VUZP/VUZPQ:31:result_uint8x16 [] = { 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, } +VUZP/VUZPQ:32:result_uint16x8 [] = { 66, 66, 66, 66, 66, 66, 66, 66, } +VUZP/VUZPQ:33:result_uint32x4 [] = { 77, 77, 77, 77, } VUZP/VUZPQ:34:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } -VUZP/VUZPQ:35:result_float32x4 [] = { c1700000 -0x1.e000000p+3 -15, c1600000 -0x1.c000000p+3 -14, c1500000 -0x1.a000000p+3 -13, 42073333 0x1.0e66660p+5 33.8, } +VUZP/VUZPQ:35:result_float32x4 [] = { 42073333 0x1.0e66660p+5 33.8, 42073333 0x1.0e66660p+5 33.8, 42073333 0x1.0e66660p+5 33.8, 42073333 0x1.0e66660p+5 33.8, } VZIP/VZIPQ chunk 0 output: VZIP/VZIPQ:0:result_int8x8 [] = { fffffff0, fffffff4, 11, 11, fffffff1, fffffff5, 11, 11, } @@ -2911,24 +2911,24 @@ VZIP/VZIPQ:16:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } VZIP/VZIPQ:17:result_float32x4 [] = { c1800000 -0x1.0000000p+4 -16, c1600000 -0x1.c000000p+3 -14, 42073333 0x1.0e66660p+5 33.8, 42073333 0x1.0e66660p+5 33.8, } VZIP/VZIPQ chunk 1 output: -VZIP/VZIPQ:18:result_int8x8 [] = { fffffff4, 11, 11, fffffff1, fffffff5, 11, 11, fffffff2, } -VZIP/VZIPQ:19:result_int16x4 [] = { fffffff2, 22, 22, fffffff1, } -VZIP/VZIPQ:20:result_int32x2 [] = { fffffff1, 33, } +VZIP/VZIPQ:18:result_int8x8 [] = { fffffff2, fffffff6, 11, 11, fffffff3, fffffff7, 11, 11, } +VZIP/VZIPQ:19:result_int16x4 [] = { fffffff1, fffffff3, 22, 22, } +VZIP/VZIPQ:20:result_int32x2 [] = { 33, 33, } VZIP/VZIPQ:21:result_int64x1 [] = { 3333333333333333, } -VZIP/VZIPQ:22:result_uint8x8 [] = { f4, 55, 55, f1, f5, 55, 55, f2, } -VZIP/VZIPQ:23:result_uint16x4 [] = { fff2, 66, 66, fff1, } -VZIP/VZIPQ:24:result_uint32x2 [] = { fffffff1, 77, } +VZIP/VZIPQ:22:result_uint8x8 [] = { f2, f6, 55, 55, f3, f7, 55, 55, } +VZIP/VZIPQ:23:result_uint16x4 [] = { fff1, fff3, 66, 66, } +VZIP/VZIPQ:24:result_uint32x2 [] = { 77, 77, } VZIP/VZIPQ:25:result_uint64x1 [] = { 3333333333333333, } -VZIP/VZIPQ:26:result_float32x2 [] = { c1700000 -0x1.e000000p+3 -15, 42066666 0x1.0ccccc0p+5 33.6, } -VZIP/VZIPQ:27:result_int8x16 [] = { fffffff8, 11, 11, fffffff1, fffffff9, 11, 11, fffffff2, fffffffa, 11, 11, fffffff3, fffffffb, 11, 11, fffffff4, } -VZIP/VZIPQ:28:result_int16x8 [] = { fffffff4, 22, 22, fffffff1, fffffff5, 22, 22, fffffff2, } -VZIP/VZIPQ:29:result_int32x4 [] = { fffffff2, 33, 33, fffffff1, } +VZIP/VZIPQ:26:result_float32x2 [] = { 42066666 0x1.0ccccc0p+5 33.6, 42066666 0x1.0ccccc0p+5 33.6, } +VZIP/VZIPQ:27:result_int8x16 [] = { fffffff4, fffffffc, 11, 11, fffffff5, fffffffd, 11, 11, fffffff6, fffffffe, 11, 11, fffffff7, ffffffff, 11, 11, } +VZIP/VZIPQ:28:result_int16x8 [] = { fffffff2, fffffff6, 22, 22, fffffff3, fffffff7, 22, 22, } +VZIP/VZIPQ:29:result_int32x4 [] = { fffffff1, fffffff3, 33, 33, } VZIP/VZIPQ:30:result_int64x2 [] = { 3333333333333333, 3333333333333333, } -VZIP/VZIPQ:31:result_uint8x16 [] = { f8, 55, 55, f1, f9, 55, 55, f2, fa, 55, 55, f3, fb, 55, 55, f4, } -VZIP/VZIPQ:32:result_uint16x8 [] = { fff4, 66, 66, fff1, fff5, 66, 66, fff2, } -VZIP/VZIPQ:33:result_uint32x4 [] = { fffffff2, 77, 77, fffffff1, } +VZIP/VZIPQ:31:result_uint8x16 [] = { f4, fc, 55, 55, f5, fd, 55, 55, f6, fe, 55, 55, f7, ff, 55, 55, } +VZIP/VZIPQ:32:result_uint16x8 [] = { fff2, fff6, 66, 66, fff3, fff7, 66, 66, } +VZIP/VZIPQ:33:result_uint32x4 [] = { fffffff1, fffffff3, 77, 77, } VZIP/VZIPQ:34:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } -VZIP/VZIPQ:35:result_float32x4 [] = { c1600000 -0x1.c000000p+3 -14, 42073333 0x1.0e66660p+5 33.8, 42073333 0x1.0e66660p+5 33.8, c1700000 -0x1.e000000p+3 -15, } +VZIP/VZIPQ:35:result_float32x4 [] = { c1700000 -0x1.e000000p+3 -15, c1500000 -0x1.a000000p+3 -13, 42073333 0x1.0e66660p+5 33.8, 42073333 0x1.0e66660p+5 33.8, } VREINTERPRET/VREINTERPRETQ output: VREINTERPRET/VREINTERPRETQ:0:result_int8x8 [] = { fffffff0, ffffffff, fffffff1, ffffffff, fffffff2, ffffffff, fffffff3, ffffffff, } @@ -5390,6 +5390,26 @@ VCLZ/VCLZQ:15:result_uint32x4 [] = { 1f, 1f, 1f, 1f, } VCLZ/VCLZQ:16:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } VCLZ/VCLZQ:17:result_float32x4 [] = { 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, } +VCLZ/VCLZQ (input=0) output: +VCLZ/VCLZQ:18:result_int8x8 [] = { 8, 8, 8, 8, 8, 8, 8, 8, } +VCLZ/VCLZQ:19:result_int16x4 [] = { 10, 10, 10, 10, } +VCLZ/VCLZQ:20:result_int32x2 [] = { 20, 20, } +VCLZ/VCLZQ:21:result_int64x1 [] = { 3333333333333333, } +VCLZ/VCLZQ:22:result_uint8x8 [] = { 8, 8, 8, 8, 8, 8, 8, 8, } +VCLZ/VCLZQ:23:result_uint16x4 [] = { 10, 10, 10, 10, } +VCLZ/VCLZQ:24:result_uint32x2 [] = { 20, 20, } +VCLZ/VCLZQ:25:result_uint64x1 [] = { 3333333333333333, } +VCLZ/VCLZQ:26:result_float32x2 [] = { 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, } +VCLZ/VCLZQ:27:result_int8x16 [] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, } +VCLZ/VCLZQ:28:result_int16x8 [] = { 10, 10, 10, 10, 10, 10, 10, 10, } +VCLZ/VCLZQ:29:result_int32x4 [] = { 20, 20, 20, 20, } +VCLZ/VCLZQ:30:result_int64x2 [] = { 3333333333333333, 3333333333333333, } +VCLZ/VCLZQ:31:result_uint8x16 [] = { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, } +VCLZ/VCLZQ:32:result_uint16x8 [] = { 10, 10, 10, 10, 10, 10, 10, 10, } +VCLZ/VCLZQ:33:result_uint32x4 [] = { 20, 20, 20, 20, } +VCLZ/VCLZQ:34:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } +VCLZ/VCLZQ:35:result_float32x4 [] = { 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, } + VCLS/VCLSQ (positive input) output: VCLS/VCLSQ:0:result_int8x8 [] = { 6, 6, 6, 6, 6, 6, 6, 6, } VCLS/VCLSQ:1:result_int16x4 [] = { 2, 2, 2, 2, } @@ -5410,7 +5430,7 @@ VCLS/VCLSQ:15:result_uint32x4 [] = { 33333333, 33333333, 33333333, 33333333, } VCLS/VCLSQ:16:result_uint64x2 [] = { 3333333333333333, 3333333333333333, } VCLS/VCLSQ:17:result_float32x4 [] = { 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, 33333333 0x1.6666660p-25 4.172325e-08, } -VCLS/VCLSQ (positive input) output: +VCLS/VCLSQ (negative input) output: VCLS/VCLSQ:18:result_int8x8 [] = { 7, 7, 7, 7, 7, 7, 7, 7, } VCLS/VCLSQ:19:result_int16x4 [] = { 1, 1, 1, 1, } VCLS/VCLSQ:20:result_int32x2 [] = { 1, 1, } @@ -40,13 +40,13 @@ void exec_vdup (void) #undef TEST_VDUP #define TEST_VDUP(Q, T1, T2, W, N) \ VECT_VAR(vector, T1, W, N) = \ - vdup##Q##_n_##T2##W(VECT_VAR(buffer, T1, W, N)[i]); \ + vdup##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]); \ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector, T1, W, N)) /* Basic test: vec=vmov(x), then store the result. */ #define TEST_VMOV(Q, T1, T2, W, N) \ VECT_VAR(vector, T1, W, N) = \ - vmov##Q##_n_##T2##W(VECT_VAR(buffer, T1, W, N)[i]); \ + vmov##Q##_n_##T2##W(VECT_VAR(buffer_dup, T1, W, N)[i]); \ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), VECT_VAR(vector, T1, W, N)) /* With ARM RVCT, we need to declare variables before any executable diff --git a/ref_vld1_dup.c b/ref_vld1_dup.c index 2115f96..5134da1 100644 --- a/ref_vld1_dup.c +++ b/ref_vld1_dup.c @@ -50,10 +50,10 @@ void exec_vld1_dup (void) for (i=0; i<3; i++) { clean_results (); - TEST_MACRO_ALL_VARIANTS_2_5(TEST_VLD1_DUP, vector, buffer); + TEST_MACRO_ALL_VARIANTS_2_5(TEST_VLD1_DUP, vector, buffer_dup); - TEST_VLD1_DUP(vector, buffer, , float, f, 32, 2); - TEST_VLD1_DUP(vector, buffer, q, float, f, 32, 4); + TEST_VLD1_DUP(vector, buffer_dup, , float, f, 32, 2); + TEST_VLD1_DUP(vector, buffer_dup, q, float, f, 32, 4); dump_results_hex (TEST_MSG); } @@ -46,7 +46,8 @@ void exec_vldX (void) result. */ #define TEST_VLDX(Q, T1, T2, W, N, X) \ VECT_ARRAY_VAR(vector, T1, W, N, X) = \ - vld##X##Q##_##T2##W(VECT_VAR(buffer, T1, W, N)); \ + /* Use dedicated init buffer, of size X */ \ + vld##X##Q##_##T2##W(VECT_ARRAY_VAR(buffer_vld##X, T1, W, N, X)); \ vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \ VECT_ARRAY_VAR(vector, T1, W, N, X)); \ memcpy(VECT_VAR(result, T1, W, N), VECT_VAR(result_bis_##X, T1, W, N), \ diff --git a/ref_vldX_dup.c b/ref_vldX_dup.c index 5f9b1eb..1b8a3e8 100644 --- a/ref_vldX_dup.c +++ b/ref_vldX_dup.c @@ -47,7 +47,8 @@ void exec_vldX_dup (void) /* Fill vector with buffer item #i */ #define TEST_VLDX_DUP(Q, T1, T2, W, N, X) \ VECT_ARRAY_VAR(vector, T1, W, N, X) = \ - vld##X##Q##_dup_##T2##W(VECT_VAR(buffer, T1, W, N)); \ + /* Use dedicated init buffer, of size X */ \ + vld##X##Q##_dup_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X)); \ \ vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \ VECT_ARRAY_VAR(vector, T1, W, N, X)); \ diff --git a/ref_vldX_lane.c b/ref_vldX_lane.c index 0b86971..7ad3d31 100644 --- a/ref_vldX_lane.c +++ b/ref_vldX_lane.c @@ -53,7 +53,8 @@ void exec_vldX_lane (void) vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \ \ VECT_ARRAY_VAR(vector, T1, W, N, X) = \ - vld##X##Q##_lane_##T2##W(VECT_VAR(buffer, T1, W, N), \ + /* Use dedicated init buffer, of size X */ \ + vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \ VECT_ARRAY_VAR(vector_src, T1, W, N, X), \ L); \ vst##X##Q##_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \ @@ -85,7 +86,13 @@ void exec_vldX_lane (void) DECL_VLDX_LANE(float, 32, 2, X); \ DECL_VLDX_LANE(float, 32, 4, X) -#define DUMMY_ARRAY(V, T, W, N, L) VECT_VAR_DECL(V,T,W,N)[N*L] + /* Add some padding to try to catch out of bound accesses. */ + /* Use an array instead of a plain char to comply with rvct + constraints. */ +#define ARRAY1(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[1]={42} +#define DUMMY_ARRAY(V, T, W, N, L) \ + VECT_VAR_DECL(V,T,W,N)[N*L]={0}; \ + ARRAY1(V##_pad,T,W,N) /* Use the same lanes regardless of the size of the array (X), for simplicity */ diff --git a/ref_vstX_lane.c b/ref_vstX_lane.c index e0c3ce7..2d15d34 100644 --- a/ref_vstX_lane.c +++ b/ref_vstX_lane.c @@ -55,7 +55,8 @@ void exec_vstX_lane (void) vld##X##Q##_##T2##W(VECT_VAR(buffer_src, T1, W, N)); \ \ VECT_ARRAY_VAR(vector, T1, W, N, X) = \ - vld##X##Q##_lane_##T2##W(VECT_VAR(buffer, T1, W, N), \ + /* Use dedicated init buffer, of size X */ \ + vld##X##Q##_lane_##T2##W(VECT_VAR(buffer_vld##X##_lane, T1, W, X), \ VECT_ARRAY_VAR(vector_src, T1, W, N, X), \ L); \ vst##X##Q##_lane_##T2##W(VECT_VAR(result_bis_##X, T1, W, N), \ diff --git a/stm-arm-neon-ref.h b/stm-arm-neon-ref.h index 2d38742..dc53ef3 100644 --- a/stm-arm-neon-ref.h +++ b/stm-arm-neon-ref.h @@ -62,10 +62,17 @@ static int32_t _ptrInf[]={0x7f800000L}; #define VECT_VAR(V,T,W,N) xNAME(V,VECT_NAME(T,W,N)) #define VECT_VAR_DECL(V, T, W, N) T##W##_t VECT_VAR(V,T,W,N) -#define VECT_VAR_DECL_INIT(V, T, W, N) T##W##_t VECT_VAR(V,T,W,N) INIT_TAB(T##W##_t) + +/* This one is used for padding between input buffers. */ +#define PAD(V, T, W, N) char VECT_VAR(V,T,W,N)=42; + +/* Array declarations. */ #define ARRAY(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[N] +#define ARRAY4(V, T, W, N) VECT_VAR_DECL(V,T,W,N)[4] +/* Arrays of vectors. */ #define VECT_ARRAY_VAR(V,T,W,N,L) xNAME(V,VECT_ARRAY_NAME(T,W,N,L)) +#define VECT_ARRAY(V, T, W, N, L) T##W##_t VECT_ARRAY_VAR(V,T,W,N,L)[N*L] static int result_idx = 0; #define DUMP(MSG,T,W,N,FMT) \ @@ -123,37 +130,142 @@ static int result_idx = 0; extern FILE* log_file; extern FILE* ref_file; -/* Sample initialization vectors. For simplicity, use the same one for - each vector size (it's not a problem if it's too large), and have - it large enough for the vld4 input samples. */ -#define INIT_TAB(T) [] = { \ - (T)-16, (T)-15, (T)-14, (T)-13, (T)-12, (T)-11, (T)-10, (T)-9, (T)-8, (T)-7, (T)-6, (T)-5, (T)-4, (T)-3, (T)-2, (T)-1, \ - (T)0, (T)1, (T)2, (T)3, (T)4, (T)5, (T)6, (T)7, (T)8, (T)9, (T)10, (T)11, (T)12, (T)13, (T)14, (T)15, \ - (T)16, (T)17,(T)18,(T)19,(T)20, (T)21, (T)22, (T)23, (T)24, (T)25, (T)26, (T)27, (T)28, (T)29, (T)30, (T)31, \ - (T)32, (T)33,(T)34,(T)35,(T)36, (T)37, (T)38, (T)39, (T)40, (T)41, (T)42, (T)43, (T)44, (T)45, (T)46, (T)47, \ - } - -/* Input buffers, 1 of each size */ -static VECT_VAR_DECL_INIT(buffer, int, 8, 8); -static VECT_VAR_DECL_INIT(buffer, int, 16, 4); -static VECT_VAR_DECL_INIT(buffer, int, 32, 2); -static VECT_VAR_DECL_INIT(buffer, int, 64, 1); -static VECT_VAR_DECL_INIT(buffer, uint, 8, 8); -static VECT_VAR_DECL_INIT(buffer, uint, 16, 4); -static VECT_VAR_DECL_INIT(buffer, uint, 32, 2); -static VECT_VAR_DECL_INIT(buffer, uint, 64, 1); -static VECT_VAR_DECL_INIT(buffer, float, 32, 2); -static VECT_VAR_DECL_INIT(buffer, int, 8, 16); -static VECT_VAR_DECL_INIT(buffer, int, 16, 8); -static VECT_VAR_DECL_INIT(buffer, int, 32, 4); -static VECT_VAR_DECL_INIT(buffer, int, 64, 2); -static VECT_VAR_DECL_INIT(buffer, uint, 8, 16); -static VECT_VAR_DECL_INIT(buffer, uint, 16, 8); -static VECT_VAR_DECL_INIT(buffer, uint, 32, 4); -static VECT_VAR_DECL_INIT(buffer, uint, 64, 2); -static VECT_VAR_DECL_INIT(buffer, float, 32, 4); - -/* Output buffers, 1 of each size */ +/* Input buffers, one of each size */ +extern ARRAY(buffer, int, 8, 8); +extern ARRAY(buffer, int, 16, 4); +extern ARRAY(buffer, int, 32, 2); +extern ARRAY(buffer, int, 64, 1); +extern ARRAY(buffer, uint, 8, 8); +extern ARRAY(buffer, uint, 16, 4); +extern ARRAY(buffer, uint, 32, 2); +extern ARRAY(buffer, uint, 64, 1); +extern ARRAY(buffer, float, 32, 2); +extern ARRAY(buffer, int, 8, 16); +extern ARRAY(buffer, int, 16, 8); +extern ARRAY(buffer, int, 32, 4); +extern ARRAY(buffer, int, 64, 2); +extern ARRAY(buffer, uint, 8, 16); +extern ARRAY(buffer, uint, 16, 8); +extern ARRAY(buffer, uint, 32, 4); +extern ARRAY(buffer, uint, 64, 2); +extern ARRAY(buffer, float, 32, 4); + +/* The tests for vld1_dup and vdup expect at least 4 entries in the + input buffer, so force 1- and 2-elements initializers to have 4 + entries. */ +extern ARRAY(buffer_dup, int, 8, 8); +extern ARRAY(buffer_dup, int, 16, 4); +extern ARRAY4(buffer_dup, int, 32, 2); +extern ARRAY4(buffer_dup, int, 64, 1); +extern ARRAY(buffer_dup, uint, 8, 8); +extern ARRAY(buffer_dup, uint, 16, 4); +extern ARRAY4(buffer_dup, uint, 32, 2); +extern ARRAY4(buffer_dup, uint, 64, 1); +extern ARRAY4(buffer_dup, float, 32, 2); +extern ARRAY(buffer_dup, int, 8, 16); +extern ARRAY(buffer_dup, int, 16, 8); +extern ARRAY(buffer_dup, int, 32, 4); +extern ARRAY4(buffer_dup, int, 64, 2); +extern ARRAY(buffer_dup, uint, 8, 16); +extern ARRAY(buffer_dup, uint, 16, 8); +extern ARRAY(buffer_dup, uint, 32, 4); +extern ARRAY4(buffer_dup, uint, 64, 2); +extern ARRAY(buffer_dup, float, 32, 4); + +/* Input buffers for vld2, one of each size */ +extern VECT_ARRAY(buffer_vld2, int, 8, 8, 2); +extern VECT_ARRAY(buffer_vld2, int, 16, 4, 2); +extern VECT_ARRAY(buffer_vld2, int, 32, 2, 2); +extern VECT_ARRAY(buffer_vld2, int, 64, 1, 2); +extern VECT_ARRAY(buffer_vld2, uint, 8, 8, 2); +extern VECT_ARRAY(buffer_vld2, uint, 16, 4, 2); +extern VECT_ARRAY(buffer_vld2, uint, 32, 2, 2); +extern VECT_ARRAY(buffer_vld2, uint, 64, 1, 2); +extern VECT_ARRAY(buffer_vld2, float, 32, 2, 2); +extern VECT_ARRAY(buffer_vld2, int, 8, 16, 2); +extern VECT_ARRAY(buffer_vld2, int, 16, 8, 2); +extern VECT_ARRAY(buffer_vld2, int, 32, 4, 2); +extern VECT_ARRAY(buffer_vld2, int, 64, 2, 2); +extern VECT_ARRAY(buffer_vld2, uint, 8, 16, 2); +extern VECT_ARRAY(buffer_vld2, uint, 16, 8, 2); +extern VECT_ARRAY(buffer_vld2, uint, 32, 4, 2); +extern VECT_ARRAY(buffer_vld2, uint, 64, 2, 2); +extern VECT_ARRAY(buffer_vld2, float, 32, 4, 2); + +/* Input buffers for vld3, one of each size */ +extern VECT_ARRAY(buffer_vld3, int, 8, 8, 3); +extern VECT_ARRAY(buffer_vld3, int, 16, 4, 3); +extern VECT_ARRAY(buffer_vld3, int, 32, 2, 3); +extern VECT_ARRAY(buffer_vld3, int, 64, 1, 3); +extern VECT_ARRAY(buffer_vld3, uint, 8, 8, 3); +extern VECT_ARRAY(buffer_vld3, uint, 16, 4, 3); +extern VECT_ARRAY(buffer_vld3, uint, 32, 2, 3); +extern VECT_ARRAY(buffer_vld3, uint, 64, 1, 3); +extern VECT_ARRAY(buffer_vld3, float, 32, 2, 3); +extern VECT_ARRAY(buffer_vld3, int, 8, 16, 3); +extern VECT_ARRAY(buffer_vld3, int, 16, 8, 3); +extern VECT_ARRAY(buffer_vld3, int, 32, 4, 3); +extern VECT_ARRAY(buffer_vld3, int, 64, 2, 3); +extern VECT_ARRAY(buffer_vld3, uint, 8, 16, 3); +extern VECT_ARRAY(buffer_vld3, uint, 16, 8, 3); +extern VECT_ARRAY(buffer_vld3, uint, 32, 4, 3); +extern VECT_ARRAY(buffer_vld3, uint, 64, 2, 3); +extern VECT_ARRAY(buffer_vld3, float, 32, 4, 3); + +/* Input buffers for vld4, one of each size */ +extern VECT_ARRAY(buffer_vld4, int, 8, 8, 4); +extern VECT_ARRAY(buffer_vld4, int, 16, 4, 4); +extern VECT_ARRAY(buffer_vld4, int, 32, 2, 4); +extern VECT_ARRAY(buffer_vld4, int, 64, 1, 4); +extern VECT_ARRAY(buffer_vld4, uint, 8, 8, 4); +extern VECT_ARRAY(buffer_vld4, uint, 16, 4, 4); +extern VECT_ARRAY(buffer_vld4, uint, 32, 2, 4); +extern VECT_ARRAY(buffer_vld4, uint, 64, 1, 4); +extern VECT_ARRAY(buffer_vld4, float, 32, 2, 4); +extern VECT_ARRAY(buffer_vld4, int, 8, 16, 4); +extern VECT_ARRAY(buffer_vld4, int, 16, 8, 4); +extern VECT_ARRAY(buffer_vld4, int, 32, 4, 4); +extern VECT_ARRAY(buffer_vld4, int, 64, 2, 4); +extern VECT_ARRAY(buffer_vld4, uint, 8, 16, 4); +extern VECT_ARRAY(buffer_vld4, uint, 16, 8, 4); +extern VECT_ARRAY(buffer_vld4, uint, 32, 4, 4); +extern VECT_ARRAY(buffer_vld4, uint, 64, 2, 4); +extern VECT_ARRAY(buffer_vld4, float, 32, 4, 4); + +/* Input buffers for vld2_lane */ +extern VECT_VAR_DECL(buffer_vld2_lane, int, 8, 2)[2]; +extern VECT_VAR_DECL(buffer_vld2_lane, int, 16, 2)[2]; +extern VECT_VAR_DECL(buffer_vld2_lane, int, 32, 2)[2]; +extern VECT_VAR_DECL(buffer_vld2_lane, int, 64, 2)[2]; +extern VECT_VAR_DECL(buffer_vld2_lane, uint, 8, 2)[2]; +extern VECT_VAR_DECL(buffer_vld2_lane, uint, 16, 2)[2]; +extern VECT_VAR_DECL(buffer_vld2_lane, uint, 32, 2)[2]; +extern VECT_VAR_DECL(buffer_vld2_lane, uint, 64, 2)[2]; +extern VECT_VAR_DECL(buffer_vld2_lane, float, 32, 2)[2]; + +/* Input buffers for vld3_lane */ +extern VECT_VAR_DECL(buffer_vld3_lane, int, 8, 3)[3]; +extern VECT_VAR_DECL(buffer_vld3_lane, int, 16, 3)[3]; +extern VECT_VAR_DECL(buffer_vld3_lane, int, 32, 3)[3]; +extern VECT_VAR_DECL(buffer_vld3_lane, int, 64, 3)[3]; +extern VECT_VAR_DECL(buffer_vld3_lane, uint, 8, 3)[3]; +extern VECT_VAR_DECL(buffer_vld3_lane, uint, 16, 3)[3]; +extern VECT_VAR_DECL(buffer_vld3_lane, uint, 32, 3)[3]; +extern VECT_VAR_DECL(buffer_vld3_lane, uint, 64, 3)[3]; +extern VECT_VAR_DECL(buffer_vld3_lane, float, 32, 3)[3]; + +/* Input buffers for vld4_lane */ +extern VECT_VAR_DECL(buffer_vld4_lane, int, 8, 4)[4]; +extern VECT_VAR_DECL(buffer_vld4_lane, int, 16, 4)[4]; +extern VECT_VAR_DECL(buffer_vld4_lane, int, 32, 4)[4]; +extern VECT_VAR_DECL(buffer_vld4_lane, int, 64, 4)[4]; +extern VECT_VAR_DECL(buffer_vld4_lane, uint, 8, 4)[4]; +extern VECT_VAR_DECL(buffer_vld4_lane, uint, 16, 4)[4]; +extern VECT_VAR_DECL(buffer_vld4_lane, uint, 32, 4)[4]; +extern VECT_VAR_DECL(buffer_vld4_lane, uint, 64, 4)[4]; +extern VECT_VAR_DECL(buffer_vld4_lane, float, 32, 4)[4]; + +/* Output buffers, one of each size */ static ARRAY(result, int, 8, 8); static ARRAY(result, int, 16, 4); static ARRAY(result, int, 32, 2); |