From c1cc7826d74587e0dc1c855810633a219b161ab3 Mon Sep 17 00:00:00 2001 From: Christophe Lyon Date: Tue, 20 Jan 2015 16:04:24 +0100 Subject: __set_neon_cumulative_sat() modifies the contents on the QC flag, and some intrinsics do so too: this patch adds the explicit dependency on the asm statement, to avoid code reordering or removal. When writing QC, the asm statement now has a fake input dependency, which is the output of the intrinsic being tested. Modifying the __set_neon_cumulative_sat macro is necessary, to be able to accept all the possible input types. Update the generic code in ref_v_binary_sat_op.c and ref_v_unary_sat_op.c accordingly, as well as all the tests involving QC. --- ref_v_binary_sat_op.c | 2 +- ref_v_unary_sat_op.c | 2 +- ref_vqdmlal.c | 2 +- ref_vqdmlal_lane.c | 2 +- ref_vqdmlal_n.c | 2 +- ref_vqdmulh.c | 2 +- ref_vqdmulh_lane.c | 2 +- ref_vqdmulh_n.c | 2 +- ref_vqdmull.c | 2 +- ref_vqdmull_lane.c | 2 +- ref_vqdmull_n.c | 2 +- ref_vqmovn.c | 2 +- ref_vqmovun.c | 2 +- ref_vqrdmulh.c | 16 ++++++++-------- ref_vqrdmulh_lane.c | 2 +- ref_vqrdmulh_n.c | 16 ++++++++-------- ref_vqrshl.c | 2 +- ref_vqrshrn_n.c | 2 +- ref_vqrshrun_n.c | 16 ++++++++-------- ref_vqshl.c | 18 +++++++++--------- ref_vqshl_n.c | 2 +- ref_vqshlu_n.c | 18 +++++++++--------- ref_vqshrn_n.c | 2 +- ref_vqshrun_n.c | 18 +++++++++--------- stm-arm-neon-ref.h | 37 +++++++++++++++++++++---------------- 25 files changed, 90 insertions(+), 85 deletions(-) diff --git a/ref_v_binary_sat_op.c b/ref_v_binary_sat_op.c index 71af870..532da16 100644 --- a/ref_v_binary_sat_op.c +++ b/ref_v_binary_sat_op.c @@ -41,7 +41,7 @@ FNNAME (INSN_NAME) /* vector_res = OP(vector1,vector2), then store the result. */ #define TEST_BINARY_SAT_OP1(INSN, Q, T1, T2, W, N) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##Q##_##T2##W(VECT_VAR(vector1, T1, W, N), \ VECT_VAR(vector2, T1, W, N)); \ diff --git a/ref_v_unary_sat_op.c b/ref_v_unary_sat_op.c index 7e6673e..b9fea48 100644 --- a/ref_v_unary_sat_op.c +++ b/ref_v_unary_sat_op.c @@ -40,7 +40,7 @@ FNNAME (INSN_NAME) { /* Basic test: y=OP(x), then store the result. */ #define TEST_UNARY_SAT_OP1(INSN, Q, T1, T2, W, N) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N)); \ vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \ diff --git a/ref_vqdmlal.c b/ref_vqdmlal.c index d51d568..59c3672 100644 --- a/ref_vqdmlal.c +++ b/ref_vqdmlal.c @@ -45,7 +45,7 @@ FNNAME (INSN_NAME) /* vector_res = OP(vector, vector3, vector4), then store the result. */ #define TEST_VQDMLXL1(INSN, T1, T2, W, W2, N) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##_##T2##W2(VECT_VAR(vector, T1, W, N), \ VECT_VAR(vector3, T1, W2, N), \ diff --git a/ref_vqdmlal_lane.c b/ref_vqdmlal_lane.c index 53073d8..e7d42f7 100644 --- a/ref_vqdmlal_lane.c +++ b/ref_vqdmlal_lane.c @@ -44,7 +44,7 @@ FNNAME (INSN_NAME) /* vector_res = vqdmlxl_lane(vector, vector3, vector4, lane), then store the result. 
*/ #define TEST_VQDMLXL_LANE1(INSN, T1, T2, W, W2, N, V) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##_##T2##W2(VECT_VAR(vector, T1, W, N), \ VECT_VAR(vector3, T1, W2, N), \ diff --git a/ref_vqdmlal_n.c b/ref_vqdmlal_n.c index 318a4ea..86f1e30 100644 --- a/ref_vqdmlal_n.c +++ b/ref_vqdmlal_n.c @@ -44,7 +44,7 @@ FNNAME (INSN_NAME) /* vector_res = vqdmlxl_n(vector, vector3, val), then store the result. */ #define TEST_VQDMLXL_N1(INSN, T1, T2, W, W2, N, V) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##_##T2##W2(VECT_VAR(vector, T1, W, N), \ VECT_VAR(vector3, T1, W2, N), \ diff --git a/ref_vqdmulh.c b/ref_vqdmulh.c index f78b649..59ff820 100644 --- a/ref_vqdmulh.c +++ b/ref_vqdmulh.c @@ -41,7 +41,7 @@ FNNAME (INSN) { /* vector_res = vqdmulh(vector,vector2,lane), then store the result. */ #define TEST_VQDMULH2(INSN, Q, T1, T2, W, N) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \ VECT_VAR(vector2, T1, W, N)); \ diff --git a/ref_vqdmulh_lane.c b/ref_vqdmulh_lane.c index 93db623..ae52667 100644 --- a/ref_vqdmulh_lane.c +++ b/ref_vqdmulh_lane.c @@ -40,7 +40,7 @@ FNNAME (INSN) { /* vector_res = vqdmulh_lane(vector,vector2,lane), then store the result. */ #define TEST_VQDMULH_LANE2(INSN, Q, T1, T2, W, N, N2, L) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), \ VECT_VAR(vector2, T1, W, N2), \ diff --git a/ref_vqdmulh_n.c b/ref_vqdmulh_n.c index 60716f7..376259e 100644 --- a/ref_vqdmulh_n.c +++ b/ref_vqdmulh_n.c @@ -42,7 +42,7 @@ FNNAME (INSN) /* vector_res = vqdmulh_n(vector,val), then store the result. */ #define TEST_VQDMULH_N2(INSN, Q, T1, T2, W, N, L) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##Q##_n_##T2##W(VECT_VAR(vector, T1, W, N), \ L); \ diff --git a/ref_vqdmull.c b/ref_vqdmull.c index f97a6c8..478181d 100644 --- a/ref_vqdmull.c +++ b/ref_vqdmull.c @@ -40,7 +40,7 @@ FNNAME (INSN) { /* Basic test: y=vqdmull(x,x), then store the result. */ #define TEST_VQDMULL2(INSN, T1, T2, W, W2, N) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W2, N)); \ VECT_VAR(vector_res, T1, W2, N) = \ INSN##_##T2##W(VECT_VAR(vector, T1, W, N), \ VECT_VAR(vector2, T1, W, N)); \ diff --git a/ref_vqdmull_lane.c b/ref_vqdmull_lane.c index b2ee183..bf92c6b 100644 --- a/ref_vqdmull_lane.c +++ b/ref_vqdmull_lane.c @@ -42,7 +42,7 @@ FNNAME (INSN) /* vector_res = vqdmull_lane(vector,vector2,lane), then store the result. */ #define TEST_VQDMULL_LANE2(INSN, T1, T2, W, W2, N, L) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W2, N)); \ VECT_VAR(vector_res, T1, W2, N) = \ INSN##_lane_##T2##W(VECT_VAR(vector, T1, W, N), \ VECT_VAR(vector2, T1, W, N), \ diff --git a/ref_vqdmull_n.c b/ref_vqdmull_n.c index 92b1e48..7a482b2 100644 --- a/ref_vqdmull_n.c +++ b/ref_vqdmull_n.c @@ -42,7 +42,7 @@ FNNAME (INSN) /* vector_res = vqdmull_n(vector,val), then store the result. 
*/ #define TEST_VQDMULL_N2(INSN, T1, T2, W, W2, N, L) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W2, N)); \ VECT_VAR(vector_res, T1, W2, N) = \ INSN##_n_##T2##W(VECT_VAR(vector, T1, W, N), \ L); \ diff --git a/ref_vqmovn.c b/ref_vqmovn.c index 87e119c..0f7c933 100644 --- a/ref_vqmovn.c +++ b/ref_vqmovn.c @@ -41,7 +41,7 @@ FNNAME (INSN_NAME) { /* Basic test: y=OP(x), then store the result. */ #define TEST_UNARY_OP1(INSN, T1, T2, W, W2, N) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##_##T2##W2(VECT_VAR(vector, T1, W2, N)); \ vst1##_##T2##W(VECT_VAR(result, T1, W, N), \ diff --git a/ref_vqmovun.c b/ref_vqmovun.c index a898d77..5582cea 100644 --- a/ref_vqmovun.c +++ b/ref_vqmovun.c @@ -41,7 +41,7 @@ FNNAME (INSN_NAME) { /* Basic test: y=OP(x), then store the result. */ #define TEST_UNARY_OP1(INSN, T1, T2, W, W2, N) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##_s##W2(VECT_VAR(vector, int, W2, N)); \ vst1##_##T2##W(VECT_VAR(result, T1, W, N), \ diff --git a/ref_vqrdmulh.c b/ref_vqrdmulh.c index 37193b7..f499b47 100644 --- a/ref_vqrdmulh.c +++ b/ref_vqrdmulh.c @@ -40,14 +40,14 @@ THE SOFTWARE. FNNAME (INSN) { /* vector_res = vqrdmulh(vector,vector2), then store the result. */ -#define TEST_VQRDMULH2(INSN, Q, T1, T2, W, N) \ - Set_Neon_Cumulative_Sat(0); \ - VECT_VAR(vector_res, T1, W, N) = \ - INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \ - VECT_VAR(vector2, T1, W, N)); \ - vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \ - VECT_VAR(vector_res, T1, W, N)); \ - dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_##T2##W), \ +#define TEST_VQRDMULH2(INSN, Q, T1, T2, W, N) \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ + VECT_VAR(vector_res, T1, W, N) = \ + INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \ + VECT_VAR(vector2, T1, W, N)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \ + VECT_VAR(vector_res, T1, W, N)); \ + dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_##T2##W), \ xSTR(T1), W, N) /* Two auxliary macros are necessary to expand INSN */ diff --git a/ref_vqrdmulh_lane.c b/ref_vqrdmulh_lane.c index 807f5c2..b2c37db 100644 --- a/ref_vqrdmulh_lane.c +++ b/ref_vqrdmulh_lane.c @@ -41,7 +41,7 @@ FNNAME (INSN) { /* vector_res = vqrdmulh_lane(vector,vector2,lane), then store the result. */ #define TEST_VQRDMULH_LANE2(INSN, Q, T1, T2, W, N, N2, L) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##Q##_lane_##T2##W(VECT_VAR(vector, T1, W, N), \ VECT_VAR(vector2, T1, W, N2), \ diff --git a/ref_vqrdmulh_n.c b/ref_vqrdmulh_n.c index 92b79b6..01f0e0b 100644 --- a/ref_vqrdmulh_n.c +++ b/ref_vqrdmulh_n.c @@ -42,14 +42,14 @@ FNNAME (INSN) int i; /* vector_res = vqrdmulh_n(vector,val), then store the result. 
*/ -#define TEST_VQRDMULH_N2(INSN, Q, T1, T2, W, N, L) \ - Set_Neon_Cumulative_Sat(0); \ - VECT_VAR(vector_res, T1, W, N) = \ - INSN##Q##_n_##T2##W(VECT_VAR(vector, T1, W, N), \ - L); \ - vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \ - VECT_VAR(vector_res, T1, W, N)); \ - dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_n_##T2##W), \ +#define TEST_VQRDMULH_N2(INSN, Q, T1, T2, W, N, L) \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ + VECT_VAR(vector_res, T1, W, N) = \ + INSN##Q##_n_##T2##W(VECT_VAR(vector, T1, W, N), \ + L); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \ + VECT_VAR(vector_res, T1, W, N)); \ + dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_n_##T2##W), \ xSTR(T1), W, N) /* Two auxliary macros are necessary to expand INSN */ diff --git a/ref_vqrshl.c b/ref_vqrshl.c index 5028bf9..e4a33e5 100644 --- a/ref_vqrshl.c +++ b/ref_vqrshl.c @@ -41,7 +41,7 @@ FNNAME (INSN) { /* Basic test: v3=vqrshl(v1,v2), then store the result. */ #define TEST_VQRSHL2(INSN, T3, Q, T1, T2, W, N) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \ VECT_VAR(vector_shift, T3, W, N)); \ diff --git a/ref_vqrshrn_n.c b/ref_vqrshrn_n.c index 2126d3a..34bf082 100644 --- a/ref_vqrshrn_n.c +++ b/ref_vqrshrn_n.c @@ -41,7 +41,7 @@ FNNAME (INSN) { /* Basic test: y=vqrshrn_n(x,v), then store the result. */ #define TEST_VQRSHRN_N2(INSN, T1, T2, W, W2, N, V) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W2, N)); \ VECT_VAR(vector_res, T1, W2, N) = \ INSN##_##T2##W(VECT_VAR(vector, T1, W, N), \ V); \ diff --git a/ref_vqrshrun_n.c b/ref_vqrshrun_n.c index 3ef1322..53d11f5 100644 --- a/ref_vqrshrun_n.c +++ b/ref_vqrshrun_n.c @@ -40,14 +40,14 @@ THE SOFTWARE. FNNAME (INSN) { /* Basic test: y=vqrshrun_n(x,v), then store the result. */ -#define TEST_VQRSHRUN_N2(INSN, T1, T2, W, W2, N, V) \ - Set_Neon_Cumulative_Sat(0); \ - VECT_VAR(vector_res, uint, W2, N) = \ - INSN##_##T2##W(VECT_VAR(vector, T1, W, N), \ - V); \ - vst1_u##W2(VECT_VAR(result, uint, W2, N), \ - VECT_VAR(vector_res, uint, W2, N)); \ - dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##_##T2##W), \ +#define TEST_VQRSHRUN_N2(INSN, T1, T2, W, W2, N, V) \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, uint, W2, N)); \ + VECT_VAR(vector_res, uint, W2, N) = \ + INSN##_##T2##W(VECT_VAR(vector, T1, W, N), \ + V); \ + vst1_u##W2(VECT_VAR(result, uint, W2, N), \ + VECT_VAR(vector_res, uint, W2, N)); \ + dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##_##T2##W), \ xSTR(T1), W, N) /* Two auxliary macros are necessary to expand INSN */ diff --git a/ref_vqshl.c b/ref_vqshl.c index 84ca9a0..a9d29d7 100644 --- a/ref_vqshl.c +++ b/ref_vqshl.c @@ -40,15 +40,15 @@ THE SOFTWARE. FNNAME (INSN) { /* Basic test: v3=vqshl(v1,v2), then store the result. 
*/ -#define TEST_VQSHL2(INSN, T3, Q, T1, T2, W, N) \ - Set_Neon_Cumulative_Sat(0); \ - VECT_VAR(vector_res, T1, W, N) = \ - INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \ - VECT_VAR(vector_shift, T3, W, N)); \ - vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \ - VECT_VAR(vector_res, T1, W, N)); \ - dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_##T2##W), \ - xSTR(T1), W, N) +#define TEST_VQSHL2(INSN, T3, Q, T1, T2, W, N) \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ + VECT_VAR(vector_res, T1, W, N) = \ + INSN##Q##_##T2##W(VECT_VAR(vector, T1, W, N), \ + VECT_VAR(vector_shift, T3, W, N)); \ + vst1##Q##_##T2##W(VECT_VAR(result, T1, W, N), \ + VECT_VAR(vector_res, T1, W, N)); \ + dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_##T2##W), \ + xSTR(T1), W, N) /* Two auxliary macros are necessary to expand INSN */ #define TEST_VQSHL1(INSN, T3, Q, T1, T2, W, N) \ diff --git a/ref_vqshl_n.c b/ref_vqshl_n.c index 263e661..3ee26b5 100644 --- a/ref_vqshl_n.c +++ b/ref_vqshl_n.c @@ -41,7 +41,7 @@ FNNAME (INSN) { /* Basic test: v2=vqshl_n(v1,v), then store the result. */ #define TEST_VQSHL_N2(INSN, Q, T1, T2, W, N, V) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W, N)); \ VECT_VAR(vector_res, T1, W, N) = \ INSN##Q##_n_##T2##W(VECT_VAR(vector, T1, W, N), \ V); \ diff --git a/ref_vqshlu_n.c b/ref_vqshlu_n.c index b72261c..27d53de 100644 --- a/ref_vqshlu_n.c +++ b/ref_vqshlu_n.c @@ -40,15 +40,15 @@ THE SOFTWARE. FNNAME (INSN) { /* Basic test: v2=vqshlu_n(v1,v), then store the result. */ -#define TEST_VQSHLU_N2(INSN, Q, T1, T2, T3, T4, W, N, V) \ - Set_Neon_Cumulative_Sat(0); \ - VECT_VAR(vector_res, T3, W, N) = \ - INSN##Q##_n_##T2##W(VECT_VAR(vector, T1, W, N), \ - V); \ - vst1##Q##_##T4##W(VECT_VAR(result, T3, W, N), \ - VECT_VAR(vector_res, T3, W, N)); \ - dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_n_##T2##W), \ - xSTR(T1), W, N) +#define TEST_VQSHLU_N2(INSN, Q, T1, T2, T3, T4, W, N, V) \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T3, W, N)); \ + VECT_VAR(vector_res, T3, W, N) = \ + INSN##Q##_n_##T2##W(VECT_VAR(vector, T1, W, N), \ + V); \ + vst1##Q##_##T4##W(VECT_VAR(result, T3, W, N), \ + VECT_VAR(vector_res, T3, W, N)); \ + dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##Q##_n_##T2##W), \ + xSTR(T1), W, N) /* Two auxliary macros are necessary to expand INSN */ #define TEST_VQSHLU_N1(INSN, Q, T1, T2, T3, T4, W, N, V) \ diff --git a/ref_vqshrn_n.c b/ref_vqshrn_n.c index 9e61ccb..96b8d61 100644 --- a/ref_vqshrn_n.c +++ b/ref_vqshrn_n.c @@ -41,7 +41,7 @@ FNNAME (INSN) { /* Basic test: y=vqshrn_n(x,v), then store the result. */ #define TEST_VQSHRN_N2(INSN, T1, T2, W, W2, N, V) \ - Set_Neon_Cumulative_Sat(0); \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, T1, W2, N)); \ VECT_VAR(vector_res, T1, W2, N) = \ INSN##_##T2##W(VECT_VAR(vector, T1, W, N), \ V); \ diff --git a/ref_vqshrun_n.c b/ref_vqshrun_n.c index c2a2e15..871da96 100644 --- a/ref_vqshrun_n.c +++ b/ref_vqshrun_n.c @@ -40,15 +40,15 @@ THE SOFTWARE. FNNAME (INSN) { /* Basic test: y=vqshrun_n(x,v), then store the result. 
*/ -#define TEST_VQSHRUN_N2(INSN, T1, T2, W, W2, N, V) \ - Set_Neon_Cumulative_Sat(0); \ - VECT_VAR(vector_res, uint, W2, N) = \ - INSN##_##T2##W(VECT_VAR(vector, T1, W, N), \ - V); \ - vst1_u##W2(VECT_VAR(result, uint, W2, N), \ - VECT_VAR(vector_res, uint, W2, N)); \ - dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##_##T2##W), \ - xSTR(T1), W, N) +#define TEST_VQSHRUN_N2(INSN, T1, T2, W, W2, N, V) \ + Set_Neon_Cumulative_Sat(0, VECT_VAR(vector_res, uint, W2, N)); \ + VECT_VAR(vector_res, uint, W2, N) = \ + INSN##_##T2##W(VECT_VAR(vector, T1, W, N), \ + V); \ + vst1_u##W2(VECT_VAR(result, uint, W2, N), \ + VECT_VAR(vector_res, uint, W2, N)); \ + dump_neon_cumulative_sat(TEST_MSG, xSTR(INSN##_##T2##W), \ + xSTR(T1), W, N) /* Two auxliary macros are necessary to expand INSN */ #define TEST_VQSHRUN_N1(INSN, T1, T2, W, W2, N, V) \ diff --git a/stm-arm-neon-ref.h b/stm-arm-neon-ref.h index 33677fd..f7c7cc6 100644 --- a/stm-arm-neon-ref.h +++ b/stm-arm-neon-ref.h @@ -576,11 +576,15 @@ typedef union { #ifdef __ARMCC_VERSION register _ARM_FPSCR _afpscr_for_qc __asm("fpscr"); # define Neon_Cumulative_Sat _afpscr_for_qc.b.QC -# define Set_Neon_Cumulative_Sat(x) {Neon_Cumulative_Sat = (x);} +# define Set_Neon_Cumulative_Sat(x, depend) {Neon_Cumulative_Sat = (x);} #else /* GCC/ARM does not know this register */ # define Neon_Cumulative_Sat __read_neon_cumulative_sat() -# define Set_Neon_Cumulative_Sat(x) __set_neon_cumulative_sat((x)) +/* We need a fake dependency to ensure correct ordering of asm + statements to preset the QC flag value, and Neon operators writing + to QC. */ +#define Set_Neon_Cumulative_Sat(x, depend) \ + __set_neon_cumulative_sat((x), (depend)) # if defined(__aarch64__) static volatile int __read_neon_cumulative_sat (void) { @@ -588,13 +592,14 @@ static volatile int __read_neon_cumulative_sat (void) { asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc)); return _afpscr_for_qc.b.QC; } -static void __set_neon_cumulative_sat (int x) { - _ARM_FPSCR _afpscr_for_qc; - asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc)); - _afpscr_for_qc.b.QC = x; - asm volatile ("msr fpsr,%0" : : "r" (_afpscr_for_qc)); - return; -} + +#define __set_neon_cumulative_sat(x, depend) { \ + _ARM_FPSCR _afpscr_for_qc; \ + asm volatile ("mrs %0,fpsr" : "=r" (_afpscr_for_qc)); \ + _afpscr_for_qc.b.QC = x; \ + asm volatile ("msr fpsr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \ + } + # else static volatile int __read_neon_cumulative_sat (void) { _ARM_FPSCR _afpscr_for_qc; @@ -602,13 +607,13 @@ static volatile int __read_neon_cumulative_sat (void) { return _afpscr_for_qc.b.QC; } -static void __set_neon_cumulative_sat (int x) { - _ARM_FPSCR _afpscr_for_qc; - asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc)); - _afpscr_for_qc.b.QC = x; - asm volatile ("vmsr fpscr,%0" : : "r" (_afpscr_for_qc)); - return; -} +#define __set_neon_cumulative_sat(x, depend) { \ + _ARM_FPSCR _afpscr_for_qc; \ + asm volatile ("vmrs %0,fpscr" : "=r" (_afpscr_for_qc)); \ + _afpscr_for_qc.b.QC = x; \ + asm volatile ("vmsr fpscr,%1" : "=X" (depend) : "r" (_afpscr_for_qc)); \ + } + # endif #endif -- cgit v1.2.3
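The core of the change is the fake-dependency trick in __set_neon_cumulative_sat(): the asm statement that presets the QC (cumulative saturation) bit now names the intrinsic's result variable as a dummy output operand, so the compiler can neither remove that write nor reorder it with the saturating intrinsic under test. The sketch below illustrates the idea in isolation; it is not part of the patch, the function name and the choice of vqadd_s8 are illustrative only, and it assumes GCC targeting 32-bit ARM with NEON (the AArch64 variant in the patch uses mrs/msr on fpsr instead of vmrs/vmsr on fpscr).

#include <arm_neon.h>

#define FPSCR_QC (1u << 27)	/* cumulative saturation (QC) bit of FPSCR */

/* Illustrative only: clear QC, then run a saturating add whose QC side
   effect is meant to be observed afterwards.  */
static int8x8_t preset_qc_then_vqadd (int8x8_t a, int8x8_t b)
{
  int8x8_t res;
  unsigned int fpscr;

  /* Read-modify-write FPSCR to clear the QC flag.  */
  asm volatile ("vmrs %0, fpscr" : "=r" (fpscr));
  fpscr &= ~FPSCR_QC;
  /* The "=X" dummy output is the fake dependency: this asm now appears
     to write 'res', so it must stay ordered before the intrinsic below,
     which also writes 'res' and may set QC.  Without it, the volatile
     asm could not be deleted, but it could still be scheduled after the
     vqadd, leaving QC preset too late.  */
  asm volatile ("vmsr fpscr, %1" : "=X" (res) : "r" (fpscr));

  res = vqadd_s8 (a, b);	/* saturates (and sets QC) on overflow */
  return res;
}

Turning __set_neon_cumulative_sat into a macro rather than keeping it a function is what lets the 'depend' argument take whatever vector type each test produces (int8x8_t, uint16x8_t, and so on), which is why every Set_Neon_Cumulative_Sat(0) call site in the tests gains the extra VECT_VAR(vector_res, ...) argument.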