From 85f026f337499654a77d32ca68929fd5d456000e Mon Sep 17 00:00:00 2001
From: Anton Konovalov <anton.konovalov@intel.com>
Date: Sun, 18 May 2014 20:01:49 +0700
Subject: [4.6,4.8,4.9] Porting ARM NEON intrinsics to SSE x86

Added arm_neon.h to gcc 4.6, 4.8 and 4.9 toolchains. Updated config.gcc files.

Change-Id: If9c12c3e31f5256b178816dffb41c86af0912db0
Signed-off-by: Anton Konovalov <anton.konovalov@intel.com>
Signed-off-by: Pavel Chupin <pavel.v.chupin@intel.com>
---
 gcc-4.6/gcc/config.gcc             |    4 +-
 gcc-4.6/gcc/config/i386/arm_neon.h | 8641 ++++++++++++++++++++++++++++++++++++
 2 files changed, 8643 insertions(+), 2 deletions(-)
 create mode 100644 gcc-4.6/gcc/config/i386/arm_neon.h

(limited to 'gcc-4.6')

diff --git a/gcc-4.6/gcc/config.gcc b/gcc-4.6/gcc/config.gcc
index 7a2b9e4b1..892213df6 100644
--- a/gcc-4.6/gcc/config.gcc
+++ b/gcc-4.6/gcc/config.gcc
@@ -345,7 +345,7 @@ i[34567]86-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       abmintrin.h bmiintrin.h tbmintrin.h"
+		       abmintrin.h bmiintrin.h tbmintrin.h arm_neon.h"
 	;;
 x86_64-*-*)
 	cpu_type=i386
@@ -357,7 +357,7 @@ x86_64-*-*)
 		       nmmintrin.h bmmintrin.h fma4intrin.h wmmintrin.h
 		       immintrin.h x86intrin.h avxintrin.h xopintrin.h
 		       ia32intrin.h cross-stdarg.h lwpintrin.h popcntintrin.h
-		       abmintrin.h bmiintrin.h tbmintrin.h"
+		       abmintrin.h bmiintrin.h tbmintrin.h arm_neon.h"
 	need_64bit_hwint=yes
 	;;
 ia64-*-*)
diff --git a/gcc-4.6/gcc/config/i386/arm_neon.h b/gcc-4.6/gcc/config/i386/arm_neon.h
new file mode 100644
index 000000000..5f56cbdae
--- /dev/null
+++ b/gcc-4.6/gcc/config/i386/arm_neon.h
@@ -0,0 +1,8641 @@
+//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation,  victoria.zhislina@intel.com
+
+//*** Copyright (C) 2012-2014 Intel Corporation.  All rights reserved.
+
+//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
+
+//By downloading, copying, installing or using the software you agree to this license.
+//If you do not agree to this license, do not download, install, copy or use the software.
+
+//                              License Agreement
+
+//Permission to use, copy, modify, and/or distribute this software for any
+//purpose with or without fee is hereby granted, provided that the above
+//copyright notice and this permission notice appear in all copies.
+
+//THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
+//REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
+//AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
+//INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
+//LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
+//OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+//PERFORMANCE OF THIS SOFTWARE.
+
+//*****************************************************************************************
+// This file is intended to simplify ARM->IA32 porting
+// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
+// and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below
+// MMX instruction set is not used due to performance overhead and the necessity to use the
+// EMMS instruction (_mm_empty())for mmx-x87 floating point switching
+//*****************************************************************************************
+
+//!!!!!!!  To use this file in your project that uses ARM NEON intinsics just keep arm_neon.h included and complile it as usual.
+//!!!!!!!  Please pay attention at #define USE_SSSE3 and USE_SSE4 below - you need to define them for newest Intel platforms for
+//!!!!!!!  greater performance. It can be done by -mssse3 or -msse4.2 (which also implies -mssse3) compiler switch.
+
+#ifndef NEON2SSE_H
+#define NEON2SSE_H
+
+#ifndef USE_SSE4
+    #if defined(__SSE4_2__)
+        #define USE_SSE4
+        #define USE_SSSE3
+    #endif
+#endif
+
+#ifndef USE_SSSE3
+    #if defined(__SSSE3__)
+        #define USE_SSSE3
+    #endif
+#endif
+
+#include <xmmintrin.h>     //SSE
+#include <emmintrin.h>     //SSE2
+#include <pmmintrin.h>     //SSE3
+
+#ifdef USE_SSSE3
+    #include <tmmintrin.h>     //SSSE3
+#else
+# warning "Some functions require SSSE3 or higher."
+#endif
+
+#ifdef USE_SSE4
+    #include <smmintrin.h>     //SSE4.1
+    #include <nmmintrin.h>     //SSE4.2
+#endif
+
+/*********************************************************************************************************************/
+//    data types conversion
+/*********************************************************************************************************************/
+
+typedef __m128 float32x4_t;
+
+typedef __m128 float16x8_t;         //not supported by IA, for compartibility
+
+typedef __m128i int8x16_t;
+typedef __m128i int16x8_t;
+typedef __m128i int32x4_t;
+typedef __m128i int64x2_t;
+typedef __m128i uint8x16_t;
+typedef __m128i uint16x8_t;
+typedef __m128i uint32x4_t;
+typedef __m128i uint64x2_t;
+typedef __m128i poly8x16_t;
+typedef __m128i poly16x8_t;
+
+#if defined(_MSC_VER) && (_MSC_VER < 1300)
+    typedef signed char int8_t;
+    typedef unsigned char uint8_t;
+    typedef signed short int16_t;
+    typedef unsigned short uint16_t;
+    typedef signed int int32_t;
+    typedef unsigned int uint32_t;
+    typedef signed long long int64_t;
+    typedef unsigned long long uint64_t;
+#elif defined(_MSC_VER)
+    typedef signed __int8 int8_t;
+    typedef unsigned __int8 uint8_t;
+    typedef signed __int16 int16_t;
+    typedef unsigned __int16 uint16_t;
+    typedef signed __int32 int32_t;
+    typedef unsigned __int32 uint32_t;
+
+typedef signed long long int64_t;
+typedef unsigned long long uint64_t;
+#else
+    #include <stdint.h>
+    #include <limits.h>
+#endif
+#if defined(_MSC_VER)
+#define SINT_MIN     (-2147483647 - 1)    /* min signed int value */
+#define SINT_MAX       2147483647         /* max signed int value */
+#else
+#define SINT_MIN     INT_MIN              /* min signed int value */
+#define SINT_MAX     INT_MAX              /* max signed int value */
+#endif
+
+typedef   float float32_t;
+typedef   float __fp16;
+
+typedef  uint8_t poly8_t;
+typedef  uint16_t poly16_t;
+
+//MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type  as functions arguments resulting in
+//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned.  To avoid it we need the special trick for functions that use these types
+
+//Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
+struct int8x16x2_t {
+    int8x16_t val[2];
+};
+struct int16x8x2_t {
+    int16x8_t val[2];
+};
+struct int32x4x2_t {
+    int32x4_t val[2];
+};
+struct int64x2x2_t {
+    int64x2_t val[2];
+};
+
+typedef struct int8x16x2_t int8x16x2_t;         //for C compilers to make them happy
+typedef struct int16x8x2_t int16x8x2_t;         //for C compilers to make them happy
+typedef struct int32x4x2_t int32x4x2_t;         //for C compilers to make them happy
+typedef struct int64x2x2_t int64x2x2_t;         //for C compilers to make them happy
+//to avoid pointers conversion
+typedef  int8x16x2_t int8x8x2_t;
+typedef  int16x8x2_t int16x4x2_t;
+typedef  int32x4x2_t int32x2x2_t;
+typedef  int64x2x2_t int64x1x2_t;
+
+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
+typedef struct int8x16x2_t uint8x16x2_t;
+typedef struct int16x8x2_t uint16x8x2_t;
+typedef struct int32x4x2_t uint32x4x2_t;
+typedef struct int64x2x2_t uint64x2x2_t;
+typedef struct int8x16x2_t poly8x16x2_t;
+typedef struct int16x8x2_t poly16x8x2_t;
+
+typedef  int8x8x2_t uint8x8x2_t;
+typedef  int16x4x2_t uint16x4x2_t;
+typedef  int32x2x2_t uint32x2x2_t;
+typedef  int64x1x2_t uint64x1x2_t;
+typedef  int8x8x2_t poly8x8x2_t;
+typedef  int16x4x2_t poly16x4x2_t;
+
+//float
+struct float32x4x2_t {
+    float32x4_t val[2];
+};
+struct float16x8x2_t {
+    float16x8_t val[2];
+};
+typedef struct float32x4x2_t float32x4x2_t;         //for C compilers to make them happy
+typedef struct float16x8x2_t float16x8x2_t;         //for C compilers to make them happy
+typedef  float32x4x2_t float32x2x2_t;
+typedef  float16x8x2_t float16x4x2_t;
+
+//4
+struct int8x16x4_t {
+    int8x16_t val[4];
+};
+struct int16x8x4_t {
+    int16x8_t val[4];
+};
+struct int32x4x4_t {
+    int32x4_t val[4];
+};
+struct int64x2x4_t {
+    int64x2_t val[4];
+};
+
+typedef struct int8x16x4_t int8x16x4_t;         //for C compilers to make them happy
+typedef struct int16x8x4_t int16x8x4_t;         //for C compilers to make them happy
+typedef struct int32x4x4_t int32x4x4_t;         //for C compilers to make them happy
+typedef struct int64x2x4_t int64x2x4_t;         //for C compilers to make them happy
+typedef  int8x16x4_t int8x8x4_t;
+typedef  int16x8x4_t int16x4x4_t;
+typedef  int32x4x4_t int32x2x4_t;
+typedef  int64x2x4_t int64x1x4_t;
+
+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
+typedef int8x8x4_t uint8x8x4_t;
+typedef int16x4x4_t uint16x4x4_t;
+typedef int32x2x4_t uint32x2x4_t;
+typedef int64x1x4_t uint64x1x4_t;
+typedef uint8x8x4_t poly8x8x4_t;
+typedef uint16x4x4_t poly16x4x4_t;
+
+typedef struct int8x16x4_t uint8x16x4_t;
+typedef struct int16x8x4_t uint16x8x4_t;
+typedef struct int32x4x4_t uint32x4x4_t;
+typedef struct int64x2x4_t uint64x2x4_t;
+typedef struct int8x16x4_t poly8x16x4_t;
+typedef struct int16x8x4_t poly16x8x4_t;
+
+struct float32x4x4_t {
+    float32x4_t val[4];
+};
+struct float16x8x4_t {
+    float16x8_t val[4];
+};
+
+typedef struct float32x4x4_t float32x4x4_t;         //for C compilers to make them happy
+typedef struct float16x8x4_t float16x8x4_t;         //for C compilers to make them happy
+typedef  float32x4x4_t float32x2x4_t;
+typedef  float16x8x4_t float16x4x4_t;
+
+//3
+struct int16x8x3_t {
+    int16x8_t val[3];
+};
+struct int32x4x3_t {
+    int32x4_t val[3];
+};
+struct int64x2x3_t {
+    int64x2_t val[3];
+};
+struct int8x16x3_t {
+    int8x16_t val[3];
+};
+
+typedef struct int16x8x3_t int16x8x3_t;         //for C compilers to make them happy
+typedef struct int32x4x3_t int32x4x3_t;         //for C compilers to make them happy
+typedef struct int64x2x3_t int64x2x3_t;         //for C compilers to make them happy
+typedef struct int8x16x3_t int8x16x3_t;         //for C compilers to make them happy
+typedef  int16x8x3_t int16x4x3_t;
+typedef  int32x4x3_t int32x2x3_t;
+typedef  int64x2x3_t int64x1x3_t;
+typedef  int8x16x3_t int8x8x3_t;
+
+/* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
+typedef struct int8x16x3_t uint8x16x3_t;
+typedef struct int16x8x3_t uint16x8x3_t;
+typedef struct int32x4x3_t uint32x4x3_t;
+typedef struct int64x2x3_t uint64x2x3_t;
+typedef struct int8x16x3_t poly8x16x3_t;
+typedef struct int16x8x3_t poly16x8x3_t;
+typedef int8x8x3_t uint8x8x3_t;
+typedef int16x4x3_t uint16x4x3_t;
+typedef int32x2x3_t uint32x2x3_t;
+typedef int64x1x3_t uint64x1x3_t;
+typedef int8x8x3_t poly8x8x3_t;
+typedef int16x4x3_t poly16x4x3_t;
+
+//float
+struct float32x4x3_t {
+    float32x4_t val[3];
+};
+struct float16x8x3_t {
+    float16x8_t val[3];
+};
+
+typedef struct float32x4x3_t float32x4x3_t;         //for C compilers to make them happy
+typedef struct float16x8x3_t float16x8x3_t;         //for C compilers to make them happy
+typedef  float32x4x3_t float32x2x3_t;
+typedef  float16x8x3_t float16x4x3_t;
+
+//****************************************************************************
+//****** Porting auxiliary macros ********************************************
+#define _M128i(a) (*(__m128i*)&(a))
+#define _M128d(a) (*(__m128d*)&(a))
+#define _M128(a) (*(__m128*)&(a))
+#define _Ui64(a) (*(uint64_t*)&(a))
+#define _UNSIGNED_T(a) u##a
+
+#define _SIGNBIT64 ((uint64_t)1 << 63)
+#define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
+#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
+
+#define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
+#define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
+
+//***************  functions attributes  ********************************************
+//***********************************************************************************
+#ifdef __GNUC__
+    #define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+    #define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
+    #define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+    #if _GCC_VERSION <  40500
+        #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
+    #else
+        #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
+    #endif
+#elif defined(_MSC_VER)|| defined (__INTEL_COMPILER)
+    #define _NEON2SSE_ALIGN_16  __declspec(align(16))
+    #define _NEON2SSE_INLINE __inline
+    #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
+#else
+    #define _NEON2SSE_ALIGN_16  __declspec(align(16))
+    #define _NEON2SSE_INLINE inline
+    #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
+#endif
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+#define __constrange(min,max)  const
+#define __transfersize(size)
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+//*************************************************************************
+//*************************************************************************
+//*********  Functions declarations as declared in original arm_neon.h *****
+//*************************************************************************
+//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
+
+int8x16_t vaddq_s8(int8x16_t a, int8x16_t b);         // VADD.I8 q0,q0,q0
+int16x8_t vaddq_s16(int16x8_t a, int16x8_t b);         // VADD.I16 q0,q0,q0
+int32x4_t vaddq_s32(int32x4_t a, int32x4_t b);         // VADD.I32 q0,q0,q0
+int64x2_t vaddq_s64(int64x2_t a, int64x2_t b);         // VADD.I64 q0,q0,q0
+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b);         // VADD.F32 q0,q0,q0
+uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b);         // VADD.I8 q0,q0,q0
+uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b);         // VADD.I16 q0,q0,q0
+uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b);         // VADD.I32 q0,q0,q0
+uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b);         // VADD.I64 q0,q0,q0
+//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+
+//Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i]
+
+//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
+
+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b);         // VHADD.S8 q0,q0,q0
+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b);         // VHADD.S16 q0,q0,q0
+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b);         // VHADD.S32 q0,q0,q0
+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b);         // VHADD.U8 q0,q0,q0
+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b);         // VHADD.U16 q0,q0,q0
+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b);         // VHADD.U32 q0,q0,q0
+//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
+
+int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b);         // VRHADD.S8 q0,q0,q0
+int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b);         // VRHADD.S16 q0,q0,q0
+int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b);         // VRHADD.S32 q0,q0,q0
+uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b);         // VRHADD.U8 q0,q0,q0
+uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b);         // VRHADD.U16 q0,q0,q0
+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b);         // VRHADD.U32 q0,q0,q0
+//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
+
+int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b);         // VQADD.S8 q0,q0,q0
+int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b);         // VQADD.S16 q0,q0,q0
+int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b);         // VQADD.S32 q0,q0,q0
+int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b);         // VQADD.S64 q0,q0,q0
+uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b);         // VQADD.U8 q0,q0,q0
+uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b);         // VQADD.U16 q0,q0,q0
+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b);         // VQADD.U32 q0,q0,q0
+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b);         // VQADD.U64 q0,q0,q0
+//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
+
+//Vector rounding add high half: vraddhn
+
+//Multiplication
+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
+
+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b);         // VMUL.I8 q0,q0,q0
+int16x8_t vmulq_s16(int16x8_t a, int16x8_t b);         // VMUL.I16 q0,q0,q0
+int32x4_t vmulq_s32(int32x4_t a, int32x4_t b);         // VMUL.I32 q0,q0,q0
+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b);         // VMUL.F32 q0,q0,q0
+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b);         // VMUL.I8 q0,q0,q0
+uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b);         // VMUL.I16 q0,q0,q0
+uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b);         // VMUL.I32 q0,q0,q0
+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b);         // VMUL.P8 q0,q0,q0
+//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
+
+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLA.I8 q0,q0,q0
+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLA.I16 q0,q0,q0
+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLA.I32 q0,q0,q0
+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLA.F32 q0,q0,q0
+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLA.I8 q0,q0,q0
+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLA.I16 q0,q0,q0
+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLA.I32 q0,q0,q0
+//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
+
+//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
+
+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLS.I8 q0,q0,q0
+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLS.I16 q0,q0,q0
+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLS.I32 q0,q0,q0
+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLS.F32 q0,q0,q0
+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLS.I8 q0,q0,q0
+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLS.I16 q0,q0,q0
+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLS.I32 q0,q0,q0
+//Vector multiply subtract long
+
+//Vector saturating doubling multiply high
+
+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b);         // VQDMULH.S16 q0,q0,q0
+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b);         // VQDMULH.S32 q0,q0,q0
+//Vector saturating rounding doubling multiply high
+
+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b);         // VQRDMULH.S16 q0,q0,q0
+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b);         // VQRDMULH.S32 q0,q0,q0
+//Vector saturating doubling multiply accumulate long
+
+//Vector saturating doubling multiply subtract long
+
+//Vector long multiply
+
+//Vector saturating doubling long multiply
+
+//Subtraction
+//Vector subtract
+
+int8x16_t vsubq_s8(int8x16_t a, int8x16_t b);         // VSUB.I8 q0,q0,q0
+int16x8_t vsubq_s16(int16x8_t a, int16x8_t b);         // VSUB.I16 q0,q0,q0
+int32x4_t vsubq_s32(int32x4_t a, int32x4_t b);         // VSUB.I32 q0,q0,q0
+int64x2_t vsubq_s64(int64x2_t a, int64x2_t b);         // VSUB.I64 q0,q0,q0
+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b);         // VSUB.F32 q0,q0,q0
+uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b);         // VSUB.I8 q0,q0,q0
+uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b);         // VSUB.I16 q0,q0,q0
+uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b);         // VSUB.I32 q0,q0,q0
+uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b);         // VSUB.I64 q0,q0,q0
+//Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
+
+//Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
+
+//Vector saturating subtract
+
+int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b);         // VQSUB.S8 q0,q0,q0
+int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b);         // VQSUB.S16 q0,q0,q0
+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b);         // VQSUB.S32 q0,q0,q0
+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b);         // VQSUB.S64 q0,q0,q0
+uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b);         // VQSUB.U8 q0,q0,q0
+uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b);         // VQSUB.U16 q0,q0,q0
+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b);         // VQSUB.U32 q0,q0,q0
+uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b);         // VQSUB.U64 q0,q0,q0
+//Vector halving subtract
+
+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b);         // VHSUB.S8 q0,q0,q0
+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b);         // VHSUB.S16 q0,q0,q0
+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b);         // VHSUB.S32 q0,q0,q0
+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b);         // VHSUB.U8 q0,q0,q0
+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b);         // VHSUB.U16 q0,q0,q0
+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b);         // VHSUB.U32 q0,q0,q0
+//Vector subtract high half
+
+//Vector rounding subtract high half
+
+//Comparison
+//Vector compare equal
+
+uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b);         // VCEQ.I8 q0, q0, q0
+uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b);         // VCEQ.I16 q0, q0, q0
+uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b);         // VCEQ.I32 q0, q0, q0
+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b);         // VCEQ.F32 q0, q0, q0
+uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b);         // VCEQ.I8 q0, q0, q0
+uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b);         // VCEQ.I16 q0, q0, q0
+uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b);         // VCEQ.I32 q0, q0, q0
+uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b);         // VCEQ.I8 q0, q0, q0
+//Vector compare greater-than or equal
+
+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.U16 q0, q0, q0
+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
+//Vector compare less-than or equal
+
+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.U16 q0, q0, q0
+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
+//Vector compare greater-than
+
+uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
+uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
+uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.U16 q0, q0, q0
+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
+//Vector compare less-than
+
+uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
+uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
+uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.U16 q0, q0, q0
+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
+//Vector compare absolute greater-than or equal
+
+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
+//Vector compare absolute less-than or equal
+
+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
+//Vector compare absolute greater-than
+
+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
+//Vector compare absolute less-than
+
+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
+//Vector test bits
+
+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b);         // VTST.8 q0, q0, q0
+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b);         // VTST.16 q0, q0, q0
+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b);         // VTST.32 q0, q0, q0
+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b);         // VTST.8 q0, q0, q0
+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b);         // VTST.16 q0, q0, q0
+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b);         // VTST.32 q0, q0, q0
+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b);         // VTST.8 q0, q0, q0
+//Absolute difference
+//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
+
+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b);         // VABD.S8 q0,q0,q0
+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b);         // VABD.S16 q0,q0,q0
+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b);         // VABD.S32 q0,q0,q0
+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b);         // VABD.U8 q0,q0,q0
+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b);         // VABD.U16 q0,q0,q0
+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b);         // VABD.U32 q0,q0,q0
+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b);         // VABD.F32 q0,q0,q0
+//Absolute difference - long
+
+//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
+
+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VABA.S8 q0,q0,q0
+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VABA.S16 q0,q0,q0
+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VABA.S32 q0,q0,q0
+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VABA.U8 q0,q0,q0
+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VABA.U16 q0,q0,q0
+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VABA.U32 q0,q0,q0
+//Absolute difference and accumulate - long
+
+//Max/Min
+//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
+
+int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b);         // VMAX.S8 q0,q0,q0
+int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b);         // VMAX.S16 q0,q0,q0
+int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b);         // VMAX.S32 q0,q0,q0
+uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b);         // VMAX.U8 q0,q0,q0
+uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b);         // VMAX.U16 q0,q0,q0
+uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b);         // VMAX.U32 q0,q0,q0
+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b);         // VMAX.F32 q0,q0,q0
+//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
+
+int8x16_t vminq_s8(int8x16_t a, int8x16_t b);         // VMIN.S8 q0,q0,q0
+int16x8_t vminq_s16(int16x8_t a, int16x8_t b);         // VMIN.S16 q0,q0,q0
+int32x4_t vminq_s32(int32x4_t a, int32x4_t b);         // VMIN.S32 q0,q0,q0
+uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b);         // VMIN.U8 q0,q0,q0
+uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b);         // VMIN.U16 q0,q0,q0
+uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b);         // VMIN.U32 q0,q0,q0
+float32x4_t vminq_f32(float32x4_t a, float32x4_t b);         // VMIN.F32 q0,q0,q0
+//Pairwise addition
+//Pairwise add
+
+//Long pairwise add
+
+int16x8_t vpaddlq_s8(int8x16_t a);         // VPADDL.S8 q0,q0
+int32x4_t vpaddlq_s16(int16x8_t a);         // VPADDL.S16 q0,q0
+int64x2_t vpaddlq_s32(int32x4_t a);         // VPADDL.S32 q0,q0
+uint16x8_t vpaddlq_u8(uint8x16_t a);         // VPADDL.U8 q0,q0
+uint32x4_t vpaddlq_u16(uint16x8_t a);         // VPADDL.U16 q0,q0
+uint64x2_t vpaddlq_u32(uint32x4_t a);         // VPADDL.U32 q0,q0
+//Long pairwise add and accumulate
+
+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b);         // VPADAL.S8 q0,q0
+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b);         // VPADAL.S16 q0,q0
+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b);         // VPADAL.S32 q0,q0
+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b);         // VPADAL.U8 q0,q0
+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b);         // VPADAL.U16 q0,q0
+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b);         // VPADAL.U32 q0,q0
+//Folding maximum vpmax -> takes maximum of adjacent pairs
+
+//Folding minimum vpmin -> takes minimum of adjacent pairs
+
+//Reciprocal/Sqrt
+
+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b);         // VRECPS.F32 q0, q0, q0
+
+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b);         // VRSQRTS.F32 q0, q0, q0
+//Shifts by signed variable
+//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
+
+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b);         // VSHL.S8 q0,q0,q0
+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b);         // VSHL.S16 q0,q0,q0
+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b);         // VSHL.S32 q0,q0,q0
+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b);         // VSHL.S64 q0,q0,q0
+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b);         // VSHL.U8 q0,q0,q0
+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b);         // VSHL.U16 q0,q0,q0
+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b);         // VSHL.U32 q0,q0,q0
+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b);         // VSHL.U64 q0,q0,q0
+//Vector saturating shift left: (negative values shift right)
+
+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b);         // VQSHL.S8 q0,q0,q0
+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b);         // VQSHL.S16 q0,q0,q0
+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b);         // VQSHL.S32 q0,q0,q0
+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b);         // VQSHL.S64 q0,q0,q0
+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b);         // VQSHL.U8 q0,q0,q0
+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b);         // VQSHL.U16 q0,q0,q0
+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b);         // VQSHL.U32 q0,q0,q0
+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b);         // VQSHL.U64 q0,q0,q0
+//Vector rounding shift left: (negative values shift right)
+
+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b);         // VRSHL.S8 q0,q0,q0
+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b);         // VRSHL.S16 q0,q0,q0
+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b);         // VRSHL.S32 q0,q0,q0
+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b);         // VRSHL.S64 q0,q0,q0
+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b);         // VRSHL.U8 q0,q0,q0
+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b);         // VRSHL.U16 q0,q0,q0
+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b);         // VRSHL.U32 q0,q0,q0
+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b);         // VRSHL.U64 q0,q0,q0
+//Vector saturating rounding shift left: (negative values shift right)
+
+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b);         // VQRSHL.S8 q0,q0,q0
+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b);         // VQRSHL.S16 q0,q0,q0
+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b);         // VQRSHL.S32 q0,q0,q0
+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b);         // VQRSHL.S64 q0,q0,q0
+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b);         // VQRSHL.U8 q0,q0,q0
+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b);         // VQRSHL.U16 q0,q0,q0
+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b);         // VQRSHL.U32 q0,q0,q0
+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b);         // VQRSHL.U64 q0,q0,q0
+//Shifts by a constant
+//Vector shift right by constant
+
+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VSHR.S8 q0,q0,#8
+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VSHR.S16 q0,q0,#16
+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VSHR.S32 q0,q0,#32
+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VSHR.S64 q0,q0,#64
+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VSHR.U8 q0,q0,#8
+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VSHR.U16 q0,q0,#16
+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VSHR.U32 q0,q0,#32
+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VSHR.U64 q0,q0,#64
+//Vector shift left by constant
+
+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
+//Vector rounding shift right by constant
+
+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VRSHR.S8 q0,q0,#8
+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VRSHR.S16 q0,q0,#16
+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VRSHR.S32 q0,q0,#32
+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VRSHR.S64 q0,q0,#64
+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VRSHR.U8 q0,q0,#8
+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VRSHR.U16 q0,q0,#16
+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VRSHR.U32 q0,q0,#32
+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VRSHR.U64 q0,q0,#64
+//Vector shift right by constant and accumulate
+
+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRA.S8 q0,q0,#8
+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRA.S16 q0,q0,#16
+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRA.S32 q0,q0,#32
+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRA.S64 q0,q0,#64
+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRA.U8 q0,q0,#8
+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRA.U16 q0,q0,#16
+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRA.U32 q0,q0,#32
+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRA.U64 q0,q0,#64
+//Vector rounding shift right by constant and accumulate
+
+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VRSRA.S8 q0,q0,#8
+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VRSRA.S16 q0,q0,#16
+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VRSRA.S32 q0,q0,#32
+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VRSRA.S64 q0,q0,#64
+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VRSRA.U8 q0,q0,#8
+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VRSRA.U16 q0,q0,#16
+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VRSRA.U32 q0,q0,#32
+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VRSRA.U64 q0,q0,#64
+//Vector saturating shift left by constant
+
+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHL.S8 q0,q0,#0
+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHL.S16 q0,q0,#0
+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHL.S32 q0,q0,#0
+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHL.S64 q0,q0,#0
+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VQSHL.U8 q0,q0,#0
+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VQSHL.U16 q0,q0,#0
+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VQSHL.U32 q0,q0,#0
+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VQSHL.U64 q0,q0,#0
+//Vector signed->unsigned saturating shift left by constant
+
+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHLU.S8 q0,q0,#0
+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHLU.S16 q0,q0,#0
+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHLU.S32 q0,q0,#0
+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHLU.S64 q0,q0,#0
+//Vector narrowing shift right by constant
+
+//Vector signed->unsigned narrowing saturating shift right by constant
+
+//Vector signed->unsigned rounding narrowing saturating shift right by constant
+
+//Vector narrowing saturating shift right by constant
+
+//Vector rounding narrowing shift right by constant
+
+//Vector rounding narrowing saturating shift right by constant
+
+//Vector widening shift left by constant
+
+//Shifts with insert
+//Vector shift right and insert
+
+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
+//Vector shift left and insert
+
+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
+//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
+//Load a single vector from memory
+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr);         // VLD1.16 {d0, d1}, [r0]
+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
+
+//Load a single lane from memory
+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane);         //VLD1.8 {d0[0]}, [r0]
+uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane);         // VLD1.64 {d0}, [r0]
+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane);         //VLD1.8 {d0[0]}, [r0]
+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane);         //VLD1.16 {d0[0]}, [r0]
+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane);         //VLD1.32 {d0[0]}, [r0]
+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane);         //VLD1.64 {d0}, [r0]
+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane);         //VLD1.8 {d0[0]}, [r0]
+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
+
+//Load all lanes of vector with same value from memory
+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr);         // VLD1.64 {d0}, [r0]
+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr);         // VLD1.64 {d0}, [r0]
+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr);         // VLD1.16 {d0[]}, [r0]
+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
+
+//Store a single vector or lane. Stores all lanes or a single lane of a vector.
+//Store a single vector into memory
+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val);         // VST1.8 {d0, d1}, [r0]
+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val);         // VST1.16 {d0, d1}, [r0]
+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val);         // VST1.32 {d0, d1}, [r0]
+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val);         // VST1.64 {d0, d1}, [r0]
+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val);         // VST1.8 {d0, d1}, [r0]
+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val);         // VST1.16 {d0, d1}, [r0]
+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val);         // VST1.32 {d0, d1}, [r0]
+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val);         // VST1.64 {d0, d1}, [r0]
+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val);         // VST1.16 {d0, d1}, [r0]
+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val);         // VST1.32 {d0, d1}, [r0]
+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val);         // VST1.8 {d0, d1}, [r0]
+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val);         // VST1.16 {d0, d1}, [r0]
+
+//Store a lane of a vector into memory
+//Loads of an N-element structure
+//Load N-element structure from memory
+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr);         // VLD2.16 {d0, d2}, [r0]
+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+//Load all lanes of N-element structure with same value from memory
+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+//Load a single lane of N-element structure from memory
+//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
+uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
+uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane);         // VLD2.32 {d0[0], d2[0]}, [r0]
+int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
+int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane);         // VLD2.32 {d0[0], d2[0]}, [r0]
+float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
+float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane);         // VLD2.32 {d0[0], d2[0]}, [r0]
+poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
+uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane);         //VLD2.8 {d0[0], d1[0]}, [r0]
+uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
+uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane);         //VLD2.8 {d0[0], d1[0]}, [r0]
+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane);         //VLD2.16 {d0[0], d1[0]}, [r0]
+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane);         //VLD2.32 {d0[0], d1[0]}, [r0]
+//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane);         //VLD2.8 {d0[0], d1[0]}, [r0]
+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
+uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane);         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane);         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane);         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane);         //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane);         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane);         //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane);         //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane);         //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane);         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane);         //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane);         //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane);         // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane);         // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);         //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);         //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);         //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);         // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane);         // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);         //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);         // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+//Store N-element structure to memory
+void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val);         // VST2.8 {d0, d2}, [r0]
+void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
+void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val);         // VST2.32 {d0, d2}, [r0]
+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);         // VST2.8 {d0, d2}, [r0]
+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);         // VST2.32 {d0, d2}, [r0]
+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
+void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val);         // VST2.32 {d0, d2}, [r0]
+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);         // VST2.8 {d0, d2}, [r0]
+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
+void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val);         // VST2.8 {d0, d1}, [r0]
+void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val);         // VST2.16 {d0, d1}, [r0]
+void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);         // VST1.64 {d0, d1}, [r0]
+void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val);         // VST2.8 {d0, d1}, [r0]
+void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val);         // VST2.16 {d0, d1}, [r0]
+void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
+void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val);         // VST1.64 {d0, d1}, [r0]
+//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
+void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
+void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val);         // VST2.8 {d0, d1}, [r0]
+void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val);         // VST2.16 {d0, d1}, [r0]
+void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val);         // VST3.8 {d0, d2, d4}, [r0]
+void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
+void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val);         // VST3.32 {d0, d2, d4}, [r0]
+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);         // VST3.8 {d0, d2, d4}, [r0]
+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);         // VST3.32 {d0, d2, d4}, [r0]
+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
+void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val);         // VST3.32 {d0, d2, d4}, [r0]
+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);         // VST3.8 {d0, d2, d4}, [r0]
+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
+void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val);         // VST3.8 {d0, d1, d2}, [r0]
+void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
+void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val);         // VST3.32 {d0, d1, d2}, [r0]
+void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val);         // VST1.64 {d0, d1, d2}, [r0]
+void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val);         // VST3.8 {d0, d1, d2}, [r0]
+void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
+void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val);         // VST3.32 {d0, d1, d2}, [r0]
+void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val);         // VST1.64 {d0, d1, d2}, [r0]
+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
+void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val);         // VST3.32 {d0, d1, d2}, [r0]
+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);         // VST3.8 {d0, d1, d2}, [r0]
+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
+void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val);         // VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val);         // VST4.32 {d0, d2, d4, d6}, [r0]
+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);         // VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);         // VST4.32 {d0, d2, d4, d6}, [r0]
+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val);         // VST4.32 {d0, d2, d4, d6}, [r0]
+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);         // VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val);         // VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val);         // VST4.32 {d0, d1, d2, d3}, [r0]
+void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val);         // VST1.64 {d0, d1, d2, d3}, [r0]
+void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val);         // VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val);         // VST4.32 {d0, d1, d2, d3}, [r0]
+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);         // VST1.64 {d0, d1, d2, d3}, [r0]
+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val);         // VST4.32 {d0, d1, d2, d3}, [r0]
+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val);         // VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
+//Store a single lane of N-element structure to memory
+void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
+void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane);         // VST2.32{d0[0], d2[0]}, [r0]
+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);         // VST2.32{d0[0], d2[0]}, [r0]
+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
+void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane);         //VST2.32 {d0[0], d2[0]}, [r0]
+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
+void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane);         // VST2.8{d0[0], d1[0]}, [r0]
+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32{d0[0], d1[0]}, [r0]
+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);         // VST2.8 {d0[0],d1[0]}, [r0]
+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32{d0[0], d1[0]}, [r0]
+void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32{d0[0], d1[0]}, [r0]
+void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane);         // VST2.8{d0[0], d1[0]}, [r0]
+void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
+void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane);         // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);         // VST3.32{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane);         //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
+void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane);         // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane);         // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);         // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);         // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);         // VST3.32{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);         // VST3.8{d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
+void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane);         // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);         // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane);         //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane);         // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane);         // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane);         // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane);         // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane);         // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);         // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
+//Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
+
+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.U16 r0, d0[0]
+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane);         // VMOV.S8 r0, d0[0]
+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane);         // VMOV.S16 r0, d0[0]
+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.U16 r0, d0[0]
+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
+
+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
+//Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
+
+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
+
+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
+//Initialize a vector from a literal bit pattern.
+
+//Set all lanes to same value
+//Load all lanes of vector to the same literal value
+
+uint8x16_t vdupq_n_u8(uint8_t value);         // VDUP.8 q0,r0
+uint16x8_t vdupq_n_u16(uint16_t value);         // VDUP.16 q0,r0
+uint32x4_t vdupq_n_u32(uint32_t value);         // VDUP.32 q0,r0
+int8x16_t vdupq_n_s8(int8_t value);         // VDUP.8 q0,r0
+int16x8_t vdupq_n_s16(int16_t value);         // VDUP.16 q0,r0
+int32x4_t vdupq_n_s32(int32_t value);         // VDUP.32 q0,r0
+poly8x16_t vdupq_n_p8(poly8_t value);         // VDUP.8 q0,r0
+poly16x8_t vdupq_n_p16(poly16_t value);         // VDUP.16 q0,r0
+float32x4_t vdupq_n_f32(float32_t value);         // VDUP.32 q0,r0
+
+int64x2_t vdupq_n_s64(int64_t value);         // VMOV d0,r0,r0
+uint64x2_t vdupq_n_u64(uint64_t value);         // VMOV d0,r0,r0
+
+uint8x16_t vmovq_n_u8(uint8_t value);         // VDUP.8 q0,r0
+uint16x8_t vmovq_n_u16(uint16_t value);         // VDUP.16 q0,r0
+uint32x4_t vmovq_n_u32(uint32_t value);         // VDUP.32 q0,r0
+int8x16_t vmovq_n_s8(int8_t value);         // VDUP.8 q0,r0
+int16x8_t vmovq_n_s16(int16_t value);         // VDUP.16 q0,r0
+int32x4_t vmovq_n_s32(int32_t value);         // VDUP.32 q0,r0
+poly8x16_t vmovq_n_p8(poly8_t value);         // VDUP.8 q0,r0
+poly16x8_t vmovq_n_p16(poly16_t value);         // VDUP.16 q0,r0
+float32x4_t vmovq_n_f32(float32_t value);         // VDUP.32 q0,r0
+
+int64x2_t vmovq_n_s64(int64_t value);         // VMOV d0,r0,r0
+uint64x2_t vmovq_n_u64(uint64_t value);         // VMOV d0,r0,r0
+//Load all lanes of the vector to the value of a lane of a vector
+
+//Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
+
+//Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
+
+//Converting vectors. These intrinsics are used to convert vectors.
+//Convert from float
+
+int32x4_t vcvtq_s32_f32(float32x4_t a);         // VCVT.S32.F32 q0, q0
+uint32x4_t vcvtq_u32_f32(float32x4_t a);         // VCVT.U32.F32 q0, q0
+
+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.S32.F32 q0, q0, #32
+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.U32.F32 q0, q0, #32
+//Convert to float
+
+float32x4_t vcvtq_f32_s32(int32x4_t a);         // VCVT.F32.S32 q0, q0
+float32x4_t vcvtq_f32_u32(uint32x4_t a);         // VCVT.F32.U32 q0, q0
+
+float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b);         // VCVT.F32.S32 q0, q0, #32
+float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b);         // VCVT.F32.U32 q0, q0, #32
+//Convert between floats
+
+//Vector narrow integer
+
+//Vector long move
+
+//Vector saturating narrow integer
+
+//Vector saturating narrow integer signed->unsigned
+
+//Table look up
+
+//Extended table look up intrinsics
+
+//Operations with a scalar value
+//Vector multiply accumulate with scalar
+
+//Vector widening multiply accumulate with scalar
+
+//Vector widening saturating doubling multiply accumulate with scalar
+
+//Vector multiply subtract with scalar
+
+//Vector widening multiply subtract with scalar
+
+//Vector widening saturating doubling multiply subtract with scalar
+
+//Vector multiply by scalar
+
+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b);         // VMUL.I16 q0,q0,d0[0]
+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b);         // VMUL.I32 q0,q0,d0[0]
+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);         // VMUL.F32 q0,q0,d0[0]
+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b);         // VMUL.I16 q0,q0,d0[0]
+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b);         // VMUL.I32 q0,q0,d0[0]
+//Vector long multiply with scalar
+
+//Vector long multiply by scalar
+
+//Vector saturating doubling long multiply with scalar
+
+//Vector saturating doubling long multiply by scalar
+
+//Vector saturating doubling multiply high with scalar
+
+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2);         // VQDMULH.S16 q0,q0,d0[0]
+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2);         // VQDMULH.S32 q0,q0,d0[0]
+//Vector saturating doubling multiply high by scalar
+
+//Vector saturating rounding doubling multiply high with scalar
+
+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2);         // VQRDMULH.S16 q0,q0,d0[0]
+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2);         // VQRDMULH.S32 q0,q0,d0[0]
+//Vector rounding saturating doubling multiply high by scalar
+
+//Vector multiply accumulate with scalar
+
+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLA.I16 q0, q0, d0[0]
+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLA.I32 q0, q0, d0[0]
+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLA.I16 q0, q0, d0[0]
+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLA.I32 q0, q0, d0[0]
+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLA.F32 q0, q0, d0[0]
+//Vector widening multiply accumulate with scalar
+
+//Vector widening saturating doubling multiply accumulate with scalar
+
+//Vector multiply subtract with scalar
+
+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLS.I16 q0, q0, d0[0]
+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLS.I32 q0, q0, d0[0]
+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLS.I16 q0, q0, d0[0]
+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLS.I32 q0, q0, d0[0]
+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLS.F32 q0, q0, d0[0]
+//Vector widening multiply subtract with scalar
+
+//Vector widening saturating doubling multiply subtract with scalar
+
+//Vector extract
+
+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
+//Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
+
+int8x16_t vrev64q_s8(int8x16_t vec);         // VREV64.8 q0,q0
+int16x8_t vrev64q_s16(int16x8_t vec);         // VREV64.16 q0,q0
+int32x4_t vrev64q_s32(int32x4_t vec);         // VREV64.32 q0,q0
+uint8x16_t vrev64q_u8(uint8x16_t vec);         // VREV64.8 q0,q0
+uint16x8_t vrev64q_u16(uint16x8_t vec);         // VREV64.16 q0,q0
+uint32x4_t vrev64q_u32(uint32x4_t vec);         // VREV64.32 q0,q0
+poly8x16_t vrev64q_p8(poly8x16_t vec);         // VREV64.8 q0,q0
+poly16x8_t vrev64q_p16(poly16x8_t vec);         // VREV64.16 q0,q0
+float32x4_t vrev64q_f32(float32x4_t vec);         // VREV64.32 q0,q0
+
+int8x16_t vrev32q_s8(int8x16_t vec);         // VREV32.8 q0,q0
+int16x8_t vrev32q_s16(int16x8_t vec);         // VREV32.16 q0,q0
+uint8x16_t vrev32q_u8(uint8x16_t vec);         // VREV32.8 q0,q0
+uint16x8_t vrev32q_u16(uint16x8_t vec);         // VREV32.16 q0,q0
+poly8x16_t vrev32q_p8(poly8x16_t vec);         // VREV32.8 q0,q0
+
+int8x16_t vrev16q_s8(int8x16_t vec);         // VREV16.8 q0,q0
+uint8x16_t vrev16q_u8(uint8x16_t vec);         // VREV16.8 q0,q0
+poly8x16_t vrev16q_p8(poly8x16_t vec);         // VREV16.8 q0,q0
+//Other single operand arithmetic
+//Absolute: Vd[i] = |Va[i]|
+
+int8x16_t vabsq_s8(int8x16_t a);         // VABS.S8 q0,q0
+int16x8_t vabsq_s16(int16x8_t a);         // VABS.S16 q0,q0
+int32x4_t vabsq_s32(int32x4_t a);         // VABS.S32 q0,q0
+float32x4_t vabsq_f32(float32x4_t a);         // VABS.F32 q0,q0
+//Saturating absolute: Vd[i] = sat(|Va[i]|)
+
+int8x16_t vqabsq_s8(int8x16_t a);         // VQABS.S8 q0,q0
+int16x8_t vqabsq_s16(int16x8_t a);         // VQABS.S16 q0,q0
+int32x4_t vqabsq_s32(int32x4_t a);         // VQABS.S32 q0,q0
+//Negate: Vd[i] = - Va[i]
+
+int8x16_t vnegq_s8(int8x16_t a);         // VNE//q0,q0
+int16x8_t vnegq_s16(int16x8_t a);         // VNE//q0,q0
+int32x4_t vnegq_s32(int32x4_t a);         // VNE//q0,q0
+float32x4_t vnegq_f32(float32x4_t a);         // VNE//q0,q0
+//Saturating Negate: sat(Vd[i] = - Va[i])
+
+int8x16_t vqnegq_s8(int8x16_t a);         // VQNE//q0,q0
+int16x8_t vqnegq_s16(int16x8_t a);         // VQNE//q0,q0
+int32x4_t vqnegq_s32(int32x4_t a);         // VQNE//q0,q0
+//Count leading sign bits
+
+int8x16_t vclsq_s8(int8x16_t a);         // VCLS.S8 q0,q0
+int16x8_t vclsq_s16(int16x8_t a);         // VCLS.S16 q0,q0
+int32x4_t vclsq_s32(int32x4_t a);         // VCLS.S32 q0,q0
+//Count leading zeros
+
+int8x16_t vclzq_s8(int8x16_t a);         // VCLZ.I8 q0,q0
+int16x8_t vclzq_s16(int16x8_t a);         // VCLZ.I16 q0,q0
+int32x4_t vclzq_s32(int32x4_t a);         // VCLZ.I32 q0,q0
+uint8x16_t vclzq_u8(uint8x16_t a);         // VCLZ.I8 q0,q0
+uint16x8_t vclzq_u16(uint16x8_t a);         // VCLZ.I16 q0,q0
+uint32x4_t vclzq_u32(uint32x4_t a);         // VCLZ.I32 q0,q0
+//Count number of set bits
+
+uint8x16_t vcntq_u8(uint8x16_t a);         // VCNT.8 q0,q0
+int8x16_t vcntq_s8(int8x16_t a);         // VCNT.8 q0,q0
+poly8x16_t vcntq_p8(poly8x16_t a);         // VCNT.8 q0,q0
+//Reciprocal estimate
+
+float32x4_t vrecpeq_f32(float32x4_t a);         // VRECPE.F32 q0,q0
+uint32x4_t vrecpeq_u32(uint32x4_t a);         // VRECPE.U32 q0,q0
+//Reciprocal square root estimate
+
+float32x4_t vrsqrteq_f32(float32x4_t a);         // VRSQRTE.F32 q0,q0
+uint32x4_t vrsqrteq_u32(uint32x4_t a);         // VRSQRTE.U32 q0,q0
+//Logical operations
+//Bitwise not
+
+int8x16_t vmvnq_s8(int8x16_t a);         // VMVN q0,q0
+int16x8_t vmvnq_s16(int16x8_t a);         // VMVN q0,q0
+int32x4_t vmvnq_s32(int32x4_t a);         // VMVN q0,q0
+uint8x16_t vmvnq_u8(uint8x16_t a);         // VMVN q0,q0
+uint16x8_t vmvnq_u16(uint16x8_t a);         // VMVN q0,q0
+uint32x4_t vmvnq_u32(uint32x4_t a);         // VMVN q0,q0
+poly8x16_t vmvnq_p8(poly8x16_t a);         // VMVN q0,q0
+//Bitwise and
+
+int8x16_t vandq_s8(int8x16_t a, int8x16_t b);         // VAND q0,q0,q0
+int16x8_t vandq_s16(int16x8_t a, int16x8_t b);         // VAND q0,q0,q0
+int32x4_t vandq_s32(int32x4_t a, int32x4_t b);         // VAND q0,q0,q0
+int64x2_t vandq_s64(int64x2_t a, int64x2_t b);         // VAND q0,q0,q0
+uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b);         // VAND q0,q0,q0
+uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b);         // VAND q0,q0,q0
+uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b);         // VAND q0,q0,q0
+uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b);         // VAND q0,q0,q0
+//Bitwise or
+
+int8x16_t vorrq_s8(int8x16_t a, int8x16_t b);         // VORR q0,q0,q0
+int16x8_t vorrq_s16(int16x8_t a, int16x8_t b);         // VORR q0,q0,q0
+int32x4_t vorrq_s32(int32x4_t a, int32x4_t b);         // VORR q0,q0,q0
+int64x2_t vorrq_s64(int64x2_t a, int64x2_t b);         // VORR q0,q0,q0
+uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b);         // VORR q0,q0,q0
+uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b);         // VORR q0,q0,q0
+uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b);         // VORR q0,q0,q0
+uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b);         // VORR q0,q0,q0
+//Bitwise exclusive or (EOR or XOR)
+
+int8x16_t veorq_s8(int8x16_t a, int8x16_t b);         // VEOR q0,q0,q0
+int16x8_t veorq_s16(int16x8_t a, int16x8_t b);         // VEOR q0,q0,q0
+int32x4_t veorq_s32(int32x4_t a, int32x4_t b);         // VEOR q0,q0,q0
+int64x2_t veorq_s64(int64x2_t a, int64x2_t b);         // VEOR q0,q0,q0
+uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b);         // VEOR q0,q0,q0
+uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b);         // VEOR q0,q0,q0
+uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b);         // VEOR q0,q0,q0
+uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b);         // VEOR q0,q0,q0
+//Bit Clear
+
+int8x16_t vbicq_s8(int8x16_t a, int8x16_t b);         // VBIC q0,q0,q0
+int16x8_t vbicq_s16(int16x8_t a, int16x8_t b);         // VBIC q0,q0,q0
+int32x4_t vbicq_s32(int32x4_t a, int32x4_t b);         // VBIC q0,q0,q0
+int64x2_t vbicq_s64(int64x2_t a, int64x2_t b);         // VBIC q0,q0,q0
+uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b);         // VBIC q0,q0,q0
+uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b);         // VBIC q0,q0,q0
+uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b);         // VBIC q0,q0,q0
+uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b);         // VBIC q0,q0,q0
+//Bitwise OR complement
+
+int8x16_t vornq_s8(int8x16_t a, int8x16_t b);         // VORN q0,q0,q0
+int16x8_t vornq_s16(int16x8_t a, int16x8_t b);         // VORN q0,q0,q0
+int32x4_t vornq_s32(int32x4_t a, int32x4_t b);         // VORN q0,q0,q0
+int64x2_t vornq_s64(int64x2_t a, int64x2_t b);         // VORN q0,q0,q0
+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b);         // VORN q0,q0,q0
+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b);         // VORN q0,q0,q0
+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b);         // VORN q0,q0,q0
+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b);         // VORN q0,q0,q0
+//Bitwise Select
+
+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c);         // VBSL q0,q0,q0
+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c);         // VBSL q0,q0,q0
+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c);         // VBSL q0,q0,q0
+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c);         // VBSL q0,q0,q0
+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VBSL q0,q0,q0
+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VBSL q0,q0,q0
+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VBSL q0,q0,q0
+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c);         // VBSL q0,q0,q0
+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c);         // VBSL q0,q0,q0
+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c);         // VBSL q0,q0,q0
+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c);         // VBSL q0,q0,q0
+//Transposition operations
+//Transpose elements
+
+int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b);         // VTRN.8 q0,q0
+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b);         // VTRN.16 q0,q0
+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b);         // VTRN.32 q0,q0
+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b);         // VTRN.8 q0,q0
+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b);         // VTRN.16 q0,q0
+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b);         // VTRN.32 q0,q0
+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b);         // VTRN.32 q0,q0
+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b);         // VTRN.8 q0,q0
+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b);         // VTRN.16 q0,q0
+//Interleave elements
+
+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b);         // VZIP.8 q0,q0
+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b);         // VZIP.16 q0,q0
+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b);         // VZIP.32 q0,q0
+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b);         // VZIP.8 q0,q0
+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b);         // VZIP.16 q0,q0
+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b);         // VZIP.32 q0,q0
+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b);         // VZIP.32 q0,q0
+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b);         // VZIP.8 q0,q0
+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b);         // VZIP.16 q0,q0
+//De-Interleave elements
+
+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b);         // VUZP.8 q0,q0
+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b);         // VUZP.16 q0,q0
+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b);         // VUZP.32 q0,q0
+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b);         // VUZP.8 q0,q0
+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b);         // VUZP.16 q0,q0
+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b);         // VUZP.32 q0,q0
+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b);         // VUZP.32 q0,q0
+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b);         // VUZP.8 q0,q0
+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b);         // VUZP.16 q0,q0
+
+//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+// the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. While for release build it is not a must,
+//for debug build we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal
+//
+#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined __INTEL_COMPILER )&& defined NDEBUG     //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers.
+
+    #if defined(USE_SSSE3)
+        #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
+    #endif
+
+    #define _MM_EXTRACT_EPI16  _mm_extract_epi16
+    #define _MM_INSERT_EPI16 _mm_insert_epi16
+    #ifdef USE_SSE4
+        #define _MM_EXTRACT_EPI8  _mm_extract_epi8
+        #define _MM_EXTRACT_EPI32  _mm_extract_epi32
+        #define _MM_EXTRACT_PS  _mm_extract_ps
+
+        #define _MM_INSERT_EPI8  _mm_insert_epi8
+        #define _MM_INSERT_EPI32 _mm_insert_epi32
+        #define _MM_INSERT_PS    _mm_insert_ps
+    #ifdef  _M_X64
+            #define _MM_INSERT_EPI64 _mm_insert_epi64
+            #define _MM_EXTRACT_EPI64 _mm_extract_epi64
+    #endif
+    #endif     //SSE4
+#else
+    #define _NEON2SSE_COMMA ,
+    #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
+            switch(LANE)         \
+        {                \
+        case 0:     return NAME(a b, 0); \
+        case 1:     return NAME(a b, 1); \
+        case 2:     return NAME(a b, 2); \
+        case 3:     return NAME(a b, 3); \
+        case 4:     return NAME(a b, 4); \
+        case 5:     return NAME(a b, 5); \
+        case 6:     return NAME(a b, 6); \
+        case 7:     return NAME(a b, 7); \
+        case 8:     return NAME(a b, 8); \
+        case 9:     return NAME(a b, 9); \
+        case 10:    return NAME(a b, 10); \
+        case 11:    return NAME(a b, 11); \
+        case 12:    return NAME(a b, 12); \
+        case 13:    return NAME(a b, 13); \
+        case 14:    return NAME(a b, 14); \
+        case 15:    return NAME(a b, 15); \
+        default:    return NAME(a b, 0); \
+        }
+
+    #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
+            switch(LANE)              \
+        {                          \
+        case 0:  return NAME(vec p,0); \
+        case 1:  return NAME(vec p,1); \
+        case 2:  return NAME(vec p,2); \
+        case 3:  return NAME(vec p,3); \
+        case 4:  return NAME(vec p,4); \
+        case 5:  return NAME(vec p,5); \
+        case 6:  return NAME(vec p,6); \
+        case 7:  return NAME(vec p,7); \
+        default: return NAME(vec p,0); \
+        }
+
+    #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
+            switch(LANE)              \
+        {                          \
+        case case0:  return NAME(vec p,case0); \
+        case case1:  return NAME(vec p,case1); \
+        case case2:  return NAME(vec p,case2); \
+        case case3:  return NAME(vec p,case3); \
+        default:     return NAME(vec p,case0); \
+        }
+
+    #if defined(USE_SSSE3)
+    _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
+    {
+        _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
+    }
+    #endif
+
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
+    {
+        _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
+    }
+
+    #ifdef USE_SSE4
+        _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
+        }
+
+        _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
+        }
+
+        _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
+        {
+            _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
+        }
+
+        _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
+        }
+
+        _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
+        {
+            _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
+        }
+    #ifdef  _M_X64
+            _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
+            {
+                switch(LANE)
+                {
+                case 0:
+                    return _mm_insert_epi64(vec,  p, 0);
+                case 1:
+                    return _mm_insert_epi64(vec,  p, 1);
+                default:
+                    return _mm_insert_epi64(vec,  p, 0);
+                }
+            }
+
+            _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
+            {
+                if (LANE ==0) return _mm_extract_epi64(val, 0);
+                else return _mm_extract_epi64(val, 1);
+            }
+    #endif
+        _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
+        {
+            _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
+        }
+
+    #endif     //USE_SSE4
+
+#endif     //#ifdef NDEBUG
+
+//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+// Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices
+// or for some specific commonly used operations implementation missing in SSE
+#ifdef USE_SSE4
+    #define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
+    #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
+    #define _MM_CVTEPU32_EPI64  _mm_cvtepu32_epi64
+
+    #define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
+    #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
+    #define _MM_CVTEPI32_EPI64  _mm_cvtepi32_epi64
+
+    #define _MM_MAX_EPI8  _mm_max_epi8
+    #define _MM_MAX_EPI32 _mm_max_epi32
+    #define _MM_MAX_EPU16 _mm_max_epu16
+    #define _MM_MAX_EPU32 _mm_max_epu32
+
+    #define _MM_MIN_EPI8  _mm_min_epi8
+    #define _MM_MIN_EPI32 _mm_min_epi32
+    #define _MM_MIN_EPU16 _mm_min_epu16
+    #define _MM_MIN_EPU32 _mm_min_epu32
+
+    #define _MM_BLENDV_EPI8 _mm_blendv_epi8
+    #define _MM_PACKUS_EPI32 _mm_packus_epi32
+    #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
+
+    #define _MM_MULLO_EPI32 _mm_mullo_epi32
+    #define _MM_MUL_EPI32  _mm_mul_epi32
+#else     //no SSE4 !!!!!!
+    _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_unpacklo_epi8(a, zero);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_unpacklo_epi16(a, zero);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        return _mm_unpacklo_epi32(a, zero);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        __m128i sign = _mm_cmpgt_epi8(zero, a);
+        return _mm_unpacklo_epi8(a, sign);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        __m128i sign = _mm_cmpgt_epi16(zero, a);
+        return _mm_unpacklo_epi16(a, sign);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
+    {
+        __m128i zero = _mm_setzero_si128();
+        __m128i sign = _mm_cmpgt_epi32(zero, a);
+        return _mm_unpacklo_epi32(a, sign);
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t tmp[4];
+        _mm_store_si128((__m128i*)tmp, vec);
+        return tmp[LANE];
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int8_t tmp[16];
+        _mm_store_si128((__m128i*)tmp, vec);
+        return (int)tmp[LANE];
+    }
+
+    _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t tmp[4];
+        _mm_store_si128((__m128i*)tmp, _M128i(vec));
+        return tmp[LANE];
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
+        _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
+        __m128i vec_masked, p_masked;
+        pvec[LANE] = p;
+        mask[LANE] = 0x0;
+        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec);         //ready for p
+        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec);         //ready for vec
+        return _mm_or_si128(vec_masked, p_masked);
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
+        _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
+        __m128i vec_masked, p_masked;
+        pvec[LANE] = (int8_t)p;
+        mask[LANE] = 0x0;
+        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec);         //ready for p
+        p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec);         //ready for vec
+        return _mm_or_si128(vec_masked, p_masked);
+    }
+
+    _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
+        __m128 tmp, vec_masked, p_masked;
+        mask[LANE >> 4] = 0x0;         //here the LANE is not actural lane, need to deal with it
+        vec_masked = _mm_and_ps (*(__m128*)mask,vec);         //ready for p
+        p_masked = _mm_andnot_ps (*(__m128*)mask, p);         //ready for vec
+        tmp = _mm_or_ps(vec_masked, p_masked);
+        return tmp;
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi8 (a, b);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi32(a, b);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
+    {
+        __m128i c8000, b_s, a_s, cmp;
+        c8000 = _mm_cmpeq_epi16 (a,a);         //0xffff
+        c8000 = _mm_slli_epi16 (c8000, 15);         //0x8000
+        b_s = _mm_sub_epi16 (b, c8000);
+        a_s = _mm_sub_epi16 (a, c8000);
+        cmp = _mm_cmpgt_epi16 (a_s, b_s);         //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
+    {
+        __m128i c80000000, b_s, a_s, cmp;
+        c80000000 = _mm_cmpeq_epi32 (a,a);         //0xffffffff
+        c80000000 = _mm_slli_epi32 (c80000000, 31);         //0x80000000
+        b_s = _mm_sub_epi32 (b, c80000000);
+        a_s = _mm_sub_epi32 (a, c80000000);
+        cmp = _mm_cmpgt_epi32 (a_s, b_s);         //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi8 (b, a);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
+    {
+        __m128i cmp, resa, resb;
+        cmp = _mm_cmpgt_epi32(b, a);
+        resa = _mm_and_si128 (cmp, a);
+        resb = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(resa, resb);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
+    {
+        __m128i c8000, b_s, a_s, cmp;
+        c8000 = _mm_cmpeq_epi16 (a,a);         //0xffff
+        c8000 = _mm_slli_epi16 (c8000, 15);         //0x8000
+        b_s = _mm_sub_epi16 (b, c8000);
+        a_s = _mm_sub_epi16 (a, c8000);
+        cmp = _mm_cmpgt_epi16 (b_s, a_s);         //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
+    {
+        __m128i c80000000, b_s, a_s, cmp;
+        c80000000 = _mm_cmpeq_epi32 (a,a);         //0xffffffff
+        c80000000 = _mm_slli_epi32 (c80000000, 31);         //0x80000000
+        b_s = _mm_sub_epi32 (b, c80000000);
+        a_s = _mm_sub_epi32 (a, c80000000);
+        cmp = _mm_cmpgt_epi32 (b_s, a_s);         //no unsigned comparison, need to go to signed
+        a_s = _mm_and_si128 (cmp,a);
+        b_s = _mm_andnot_si128 (cmp,b);
+        return _mm_or_si128(a_s, b_s);
+    }
+
+    _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask)         //this is NOT exact implementation of _mm_blendv_epi8  !!!!! - please see below
+    {         //it assumes mask is either 0xff or 0  always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters.
+        __m128i a_masked, b_masked;
+        b_masked = _mm_and_si128 (mask,b);         //use b if mask 0xff
+        a_masked = _mm_andnot_si128 (mask,a);
+        return _mm_or_si128(a_masked, b_masked);
+    }
+
+    #if defined(USE_SSSE3)
+    _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
+    {
+        _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
+        __m128i a16, b16, res, reshi,cmp, zero;
+        zero = _mm_setzero_si128();
+        a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
+        b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
+        res = _mm_unpacklo_epi64(a16, b16);         //result without saturation
+        reshi = _mm_unpackhi_epi64(a16, b16);         //hi part of result used for saturation
+        cmp = _mm_cmpgt_epi16(zero, reshi);         //if cmp<0 the result should be zero
+        res = _mm_andnot_si128(cmp,res);         //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+        cmp = _mm_cmpgt_epi16(reshi,zero);         //if cmp positive
+        return _mm_or_si128(res, cmp);         //if cmp positive we are out of 16bits need to saturaate to 0xffff
+    }
+    #endif
+
+    #if defined(USE_SSSE3)
+    _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
+    {
+        _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
+        __m128i a16, res, reshi,cmp, zero;
+        zero = _mm_setzero_si128();
+        a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
+        reshi = _mm_unpackhi_epi64(a16, a16);         //hi part of result used for saturation
+        cmp = _mm_cmpgt_epi16(zero, reshi);         //if cmp<0 the result should be zero
+        res = _mm_andnot_si128(cmp, a16);         //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
+        cmp = _mm_cmpgt_epi16(reshi,zero);         //if cmp positive
+        return _mm_or_si128(res, cmp);         //if cmp positive we are out of 16bits need to saturaate to 0xffff
+    }
+    #endif
+
+    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
+    {
+        _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
+        int64_t res64;
+        int i;
+        _mm_store_si128((__m128i*)atmp, a);
+        _mm_store_si128((__m128i*)btmp, b);
+        for (i = 0; i<4; i++) {
+            res64 = atmp[i] * btmp[i];
+            res[i] = (int)(res64 & 0xffffffff);
+        }
+        return _mm_load_si128((__m128i*)res);
+    }
+
+    #if defined(USE_SSSE3)
+    _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
+    {
+        __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
+        sign = _mm_xor_si128 (a, b);
+        sign =  _mm_srai_epi32 (sign, 31);         //promote sign bit to all fields, all fff if negative and all 0 if positive
+        zero = _mm_setzero_si128();
+        a_neg = _mm_abs_epi32 (a);         //negate a and b
+        b_neg = _mm_abs_epi32 (b);         //negate a and b
+        mul_us = _mm_mul_epu32 (a_neg, b_neg);         //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
+        mul_us_neg = _mm_sub_epi64(zero, mul_us);
+        mul_us_neg = _mm_and_si128(sign, mul_us_neg);
+        mul_us = _mm_andnot_si128(sign, mul_us);
+        return _mm_or_si128 (mul_us, mul_us_neg);
+    }
+    #endif
+#endif     //SSE4
+
+#ifndef _MM_INSERT_EPI64     //special case of SSE4 and  _M_X64
+    _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
+        _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff,0xffffffffffffffff};
+        __m128i vec_masked, p_masked;
+        pvec[LANE] = p;
+        mask[LANE] = 0x0;
+        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec);         //ready for p
+        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec);         //ready for vec
+        return _mm_or_si128(vec_masked, p_masked);
+    }
+#endif
+#ifndef _MM_EXTRACT_EPI64     //special case of SSE4 and  _M_X64
+    _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
+    {
+        _NEON2SSE_ALIGN_16 int64_t tmp[2];
+        _mm_store_si128((__m128i*)tmp, val);
+        return tmp[LANE];
+    }
+#endif
+
+int32x4_t  vqd_s32(int32x4_t a);         //Doubling saturation for signed ints
+_NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
+{         //Overflow happens only if a and sum have the opposite signs
+    __m128i c7fffffff, res, res_sat, res_xor_a;
+    c7fffffff = _mm_set1_epi32(0x7fffffff);
+    res = _mm_slli_epi32 (a, 1);         // res = a*2
+    res_sat = _mm_srli_epi32(a, 31);
+    res_sat = _mm_add_epi32(res_sat, c7fffffff);
+    res_xor_a = _mm_xor_si128(res, a);
+    res_xor_a = _mm_srai_epi32(res_xor_a,31);         //propagate the sigh bit, all ffff if <0 all ones otherwise
+    res_sat = _mm_and_si128(res_xor_a, res_sat);
+    res = _mm_andnot_si128(res_xor_a, res);
+    return _mm_or_si128(res, res_sat);
+}
+
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+//*************************************************************************
+//*************************************************************************
+//*****************  Functions redefinition\implementatin starts here *****
+//*************************************************************************
+//*************************************************************************
+//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+/*If the unified intrinsics solutions is necessary please define your SSE intrinsics wrap here like in the following sample:
+#ifdef ARM
+#define vector_addq_s32 _mm_add_epi32
+#else //if we have IA
+#endif
+
+********************************************************************************************
+Functions below are organised in the following way:
+
+Each NEON intrinsic function has one of the following options:
+1.  its x86 full equivalent SSE intrinsic - in this case x86 version just follows the NEON one under the corresponding #define statement
+2.  x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement
+3.  the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition
+4.  for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance
+the serial implementation is provided along with the corresponding compiler warnin//these functions are on your app critical path
+- please consider such functions removal from your code.
+*/
+
+//***********************************************************************
+//************************      Vector add   *****************************
+//***********************************************************************
+
+int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b);         // VADD.I8 q0,q0,q0
+#define vaddq_s8 _mm_add_epi8
+
+int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b);         // VADD.I16 q0,q0,q0
+#define vaddq_s16 _mm_add_epi16
+
+int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b);         // VADD.I32 q0,q0,q0
+#define vaddq_s32 _mm_add_epi32
+
+int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b);         // VADD.I64 q0,q0,q0
+#define vaddq_s64 _mm_add_epi64
+
+float32x4_t vaddq_f32(float32x4_t a, float32x4_t b);         // VADD.F32 q0,q0,q0
+#define vaddq_f32 _mm_add_ps
+
+uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b);         // VADD.I8 q0,q0,q0
+#define vaddq_u8 _mm_add_epi8
+
+uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b);         // VADD.I16 q0,q0,q0
+#define vaddq_u16 _mm_add_epi16
+
+uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b);         // VADD.I32 q0,q0,q0
+#define vaddq_u32 _mm_add_epi32
+
+uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b);         // VADD.I64 q0,q0,q0
+#define vaddq_u64 _mm_add_epi64
+
+//**************************** Vector long add *****************************:
+//***********************************************************************
+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+
+//***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
+//*************** *********************************************************************
+
+//******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
+//*************************************************************************************************************************
+
+int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b);         // VHADD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
+{         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = vshrq_n_s8(tmp2,1);
+    return _mm_add_epi8(tmp1,tmp2);
+}
+
+int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b);         // VHADD.S1 6 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
+{         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = _mm_srai_epi16(tmp2,1);
+    return _mm_add_epi16(tmp1,tmp2);
+}
+
+int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b);         // VHADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b)         // VHADD.S32 q0,q0,q0
+{         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = _mm_srai_epi32(tmp2,1);
+    return _mm_add_epi32(tmp1,tmp2);
+}
+
+uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b);         // VHADD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b)         // VHADD.U8 q0,q0,q0
+{
+    __m128i c1, sum, res;
+    c1 = _mm_set1_epi8(1);
+    sum = _mm_avg_epu8(a, b);         //result is rounded, need to compensate it
+    res = _mm_xor_si128(a, b);         //for rounding compensation
+    res = _mm_and_si128(res,c1);         //for rounding compensation
+    return _mm_sub_epi8 (sum, res);         //actual rounding compensation
+}
+
+uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b);         // VHADD.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b)         // VHADD.s16 q0,q0,q0
+{
+    __m128i sum, res;
+    sum = _mm_avg_epu16(a, b);         //result is rounded, need to compensate it
+    res = _mm_xor_si128(a, b);         //for rounding compensation
+    res = _mm_slli_epi16 (res,15);         //shift left  then back right to
+    res = _mm_srli_epi16 (res,15);         //get 1 or zero
+    return _mm_sub_epi16 (sum, res);         //actual rounding compensation
+}
+
+uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b);         // VHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b)         // VHADD.U32 q0,q0,q0
+{         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_and_si128(a,b);
+    tmp2 = _mm_xor_si128(a,b);
+    tmp2 = _mm_srli_epi32(tmp2,1);
+    return _mm_add_epi32(tmp1,tmp2);
+}
+
+//************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
+//*****************************************************************************************************************************
+
+//SSE, result rounding!!!
+
+//SSE, result rounding!!!
+
+int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b);         // VRHADD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b)         // VRHADD.S8 q0,q0,q0
+{         //no signed average in x86 SIMD, go to unsigned
+    __m128i c128, au, bu, sum;
+    c128 = _mm_set1_epi8(128);
+    au = _mm_add_epi8(a, c128);
+    bu = _mm_add_epi8(b, c128);
+    sum = _mm_avg_epu8(au, bu);
+    return _mm_sub_epi8 (sum, c128);
+}
+
+int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b);         // VRHADD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b)         // VRHADD.S16 q0,q0,q0
+{         //no signed average in x86 SIMD, go to unsigned
+    __m128i cx8000, au, bu, sum;
+    cx8000 = _mm_set1_epi16(0x8000);
+    au = _mm_add_epi16(a, cx8000);
+    bu = _mm_add_epi16(b, cx8000);
+    sum = _mm_avg_epu16(au, bu);
+    return _mm_sub_epi16 (sum, cx8000);
+}
+
+int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b);         // VRHADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
+{         //need to avoid overflow
+    __m128i a2, b2, res, sum;
+    a2 = _mm_srai_epi32(a,1);         //a2=a/2;
+    b2 = _mm_srai_epi32(b,1);         // b2=b/2;
+    res = _mm_or_si128(a,b);         //for rounding
+    res = _mm_slli_epi32 (res,31);         //shift left  then back right to
+    res = _mm_srli_epi32 (res,31);         //get 1 or zero
+    sum = _mm_add_epi32(a2,b2);
+    return _mm_add_epi32(sum,res);
+}
+
+uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b);         // VRHADD.U8 q0,q0,q0
+#define vrhaddq_u8 _mm_avg_epu8         //SSE2, results rounded
+
+uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b);         // VRHADD.s16 q0,q0,q0
+#define vrhaddq_u16 _mm_avg_epu16         //SSE2, results rounded
+
+uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b);         // VRHADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b)         // VRHADD.U32 q0,q0,q0
+{         //need to avoid overflow
+    __m128i a2, b2, res, sum;
+    a2 = _mm_srli_epi32(a,1);         //a2=a/2;
+    b2 = _mm_srli_epi32(b,1);         // b2=b/2;
+    res = _mm_or_si128(a,b);         //for rounding
+    res = _mm_slli_epi32 (res,31);         //shift left  then back right to
+    res = _mm_srli_epi32 (res,31);         //get 1 or zero
+    sum = _mm_add_epi32(a2,b2);
+    return _mm_add_epi32(sum,res);
+}
+
+//****************** VQADD: Vector saturating add ************************
+//************************************************************************
+
+int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b);         // VQADD.S8 q0,q0,q0
+#define vqaddq_s8 _mm_adds_epi8
+
+int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b);         // VQADD.S16 q0,q0,q0
+#define vqaddq_s16 _mm_adds_epi16
+
+int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b);         // VQADD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
+{         //no corresponding x86 SIMD soulution, special tricks are necessary. Overflow happens only if a and b have the same sign and sum has the opposite sign
+    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
+    c7fffffff = _mm_set1_epi32(0x7fffffff);
+    res = _mm_add_epi32(a, b);
+    res_sat = _mm_srli_epi32(a, 31);
+    res_sat = _mm_add_epi32(res_sat, c7fffffff);
+    res_xor_a = _mm_xor_si128(res, a);
+    b_xor_a_ = _mm_xor_si128(b, a);
+    res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
+    res_xor_a = _mm_srai_epi32(res_xor_a,31);         //propagate the sigh bit, all ffff if <0 all ones otherwise
+    res_sat = _mm_and_si128(res_xor_a, res_sat);
+    res = _mm_andnot_si128(res_xor_a, res);
+    return _mm_or_si128(res, res_sat);
+}
+
+int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b);         // VQADD.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    _mm_store_si128((__m128i*)btmp, b);
+    res[0] = atmp[0] + btmp[0];
+    res[1] = atmp[1] + btmp[1];
+
+    atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
+    atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
+
+    if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
+        res[0] = atmp[0];
+    }
+    if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
+        res[1] = atmp[1];
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b);         // VQADD.U8 q0,q0,q0
+#define vqaddq_u8 _mm_adds_epu8
+
+uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b);         // VQADD.s16 q0,q0,q0
+#define vqaddq_u16 _mm_adds_epu16
+
+uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b);         // VQADD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
+{
+    __m128i c80000000, cmp, subsum, suba, sum;
+    c80000000 = _mm_set1_epi32 (0x80000000);
+    sum = _mm_add_epi32 (a, b);
+    subsum = _mm_sub_epi32 (sum, c80000000);
+    suba = _mm_sub_epi32 (a, c80000000);
+    cmp = _mm_cmpgt_epi32 ( suba, subsum);         //no unsigned comparison, need to go to signed
+    return _mm_or_si128 (sum, cmp);         //saturation
+}
+
+uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b);         // VQADD.U64 q0,q0,q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
+    {
+        __m128i c80000000, sum, cmp, suba, subsum;
+        c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
+        sum = _mm_add_epi64 (a, b);
+        subsum = _mm_sub_epi64 (sum, c80000000);
+        suba = _mm_sub_epi64 (a, c80000000);
+        cmp = _mm_cmpgt_epi64 ( suba, subsum);         //no unsigned comparison, need to go to signed, SSE4.2!!!
+        return _mm_or_si128 (sum, cmp);         //saturation
+    }
+#else
+    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+    {
+        _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
+        _mm_store_si128((__m128i*)atmp, a);
+        _mm_store_si128((__m128i*)btmp, b);
+        res[0] = atmp[0] + btmp[0];
+        res[1] = atmp[1] + btmp[1];
+        if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
+        if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
+        return _mm_load_si128((__m128i*)(res));
+    }
+#endif
+
+//******************* Vector add high half (truncated)  ******************
+//************************************************************************
+
+//*********** Vector rounding add high half: vraddhn_<type> ******************.
+//***************************************************************************
+
+//**********************************************************************************
+//*********             Multiplication            *************************************
+//**************************************************************************************
+
+//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
+//As we don't go to wider result functions are equal to "multiply low" in x86
+
+#if defined(USE_SSSE3)
+int8x16_t vmulq_s8(int8x16_t a, int8x16_t b);         // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b)         // VMUL.I8 q0,q0,q0
+{         // no 8 bit simd multiply, need to go to 16 bits
+      //solution may be not optimal
+    __m128i a16, b16, r16_1, r16_2;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    a16 = _MM_CVTEPI8_EPI16 (a);         // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b);         // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (a16, b16);
+    //swap hi and low part of a and b to process the remaining data
+    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    a16 = _MM_CVTEPI8_EPI16 (a16);         // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1  __m128i r16_2
+
+    r16_2 = _mm_mullo_epi16 (a16, b16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd);         //return to 8 bit
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd);         //return to 8 bit
+
+    return _mm_unpacklo_epi64(r16_1,  r16_2);
+}
+#endif
+
+int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b);         // VMUL.I16 q0,q0,q0
+#define vmulq_s16 _mm_mullo_epi16
+
+int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b);         // VMUL.I32 q0,q0,q0
+#define vmulq_s32 _MM_MULLO_EPI32         //SSE4.1
+
+float32x4_t vmulq_f32(float32x4_t a, float32x4_t b);         // VMUL.F32 q0,q0,q0
+#define vmulq_f32 _mm_mul_ps
+
+uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b);         // VMUL.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b)         // VMUL.I8 q0,q0,q0
+{         // no 8 bit simd multiply, need to go to 16 bits
+      //solution may be not optimal
+    __m128i maskff, a16, b16, r16_1, r16_2;
+    maskff = _mm_set1_epi16(0xff);
+    a16 = _MM_CVTEPU8_EPI16 (a);         // SSE 4.1
+    b16 = _MM_CVTEPU8_EPI16 (b);         // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (a16, b16);
+    r16_1 = _mm_and_si128(r16_1, maskff);         //to avoid saturation
+    //swap hi and low part of a and b to process the remaining data
+    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    a16 = _MM_CVTEPI8_EPI16 (a16);         // SSE 4.1
+    b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (a16, b16);
+    r16_2 = _mm_and_si128(r16_2, maskff);         //to avoid saturation
+    return _mm_packus_epi16 (r16_1,  r16_2);
+}
+
+uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b);         // VMUL.I16 q0,q0,q0
+#define vmulq_u16 _mm_mullo_epi16
+
+uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b);         // VMUL.I32 q0,q0,q0
+#define vmulq_u32 _MM_MULLO_EPI32         //SSE4.1
+
+poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b);         // VMUL.P8 q0,q0,q0
+_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
+{         //may be optimized
+    __m128i c1, res, tmp, bmasked;
+    int i;
+    c1 = _mm_cmpeq_epi8 (a,a);         //all ones 0xff....
+    c1 = vshrq_n_u8(c1,7);         //0x1
+    bmasked = _mm_and_si128(b, c1);         //0x1
+    res = vmulq_u8(a, bmasked);
+    for(i = 1; i<8; i++) {
+        c1 = _mm_slli_epi16(c1,1);         //shift mask left by 1, 16 bit shift is OK here
+        bmasked = _mm_and_si128(b, c1);         //0x1
+        tmp = vmulq_u8(a, bmasked);
+        res = _mm_xor_si128(res, tmp);
+    }
+    return res;
+}
+
+//************************* Vector long multiply ***********************************
+//****************************************************************************
+
+//****************Vector saturating doubling long multiply **************************
+//*****************************************************************
+
+//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
+//******************************************************************************************
+
+#if defined(USE_SSSE3)
+int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c)         // VMLA.I8 q0,q0,q0
+{         //solution may be not optimal
+      // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2,r16_2;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    b16 = _MM_CVTEPI8_EPI16 (b);         // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c);         // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);         //return to 8 bits
+    r16_1 = _mm_add_epi8 (r16_1, a);
+    //swap hi and low part of a, b and c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c16);         // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_add_epi8(r16_2, a_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+#endif
+
+int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLA.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c)         // VMLA.I16 q0,q0,q0
+{
+    __m128i res;
+    res = _mm_mullo_epi16 (c, b);
+    return _mm_add_epi16 (res, a);
+}
+
+int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLA.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c)         // VMLA.I32 q0,q0,q0
+{
+    __m128i res;
+    res = _MM_MULLO_EPI32 (c,  b);         //SSE4.1
+    return _mm_add_epi32 (res, a);
+}
+
+float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLA.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)         // VMLA.F32 q0,q0,q0
+{         //fma is coming soon, but right now:
+    __m128 res;
+    res = _mm_mul_ps (c, b);
+    return _mm_add_ps (a, res);
+}
+
+#if defined(USE_SSSE3)
+uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLA.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)         // VMLA.I8 q0,q0,q0
+{         //solution may be not optimal
+      // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2, r16_2;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    b16 = _MM_CVTEPU8_EPI16 (b);         // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c);         // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);         //return to 8 bits
+    r16_1 = _mm_add_epi8 (r16_1, a);
+    //swap hi and low part of a, b and c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPU8_EPI16 (b16);         // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c16);         // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_add_epi8(r16_2, a_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+#endif
+
+uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLA.I16 q0,q0,q0
+#define vmlaq_u16 vmlaq_s16
+
+uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLA.I32 q0,q0,q0
+#define vmlaq_u32 vmlaq_s32
+
+//**********************  Vector widening multiply accumulate (long multiply accumulate):
+//                          vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
+//********************************************************************************************
+
+//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
+//********************************************************************************************
+
+#if defined(USE_SSSE3)
+int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c)         // VMLS.I8 q0,q0,q0
+{         //solution may be not optimal
+      // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2, r16_2;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    b16 = _MM_CVTEPI8_EPI16 (b);         // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c);         // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
+    r16_1 = _mm_sub_epi8 (a, r16_1);
+    //swap hi and low part of a, b, c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1
+    c16 = _MM_CVTEPI8_EPI16 (c16);         // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_sub_epi8 (a_2, r16_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+#endif
+
+int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLS.I16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c)         // VMLS.I16 q0,q0,q0
+{
+    __m128i res;
+    res = _mm_mullo_epi16 (c, b);
+    return _mm_sub_epi16 (a, res);
+}
+
+int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLS.I32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c)         // VMLS.I32 q0,q0,q0
+{
+    __m128i res;
+    res = _MM_MULLO_EPI32 (c, b);         //SSE4.1
+    return _mm_sub_epi32 (a, res);
+}
+
+float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLS.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)         // VMLS.F32 q0,q0,q0
+{
+    __m128 res;
+    res = _mm_mul_ps (c, b);
+    return _mm_sub_ps (a, res);
+}
+
+#if defined(USE_SSSE3)
+uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLS.I8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)         // VMLS.I8 q0,q0,q0
+{         //solution may be not optimal
+      // no 8 bit simd multiply, need to go to 16 bits
+    __m128i b16, c16, r16_1, a_2, r16_2;
+    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
+    b16 = _MM_CVTEPU8_EPI16 (b);         // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c);         // SSE 4.1
+    r16_1 = _mm_mullo_epi16 (b16, c16);
+    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);         //return to 8 bits
+    r16_1 = _mm_sub_epi8 (a, r16_1);
+    //swap hi and low part of a, b and c to process the remaining data
+    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
+    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
+    b16 = _MM_CVTEPU8_EPI16 (b16);         // SSE 4.1
+    c16 = _MM_CVTEPU8_EPI16 (c16);         // SSE 4.1
+
+    r16_2 = _mm_mullo_epi16 (b16, c16);
+    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
+    r16_2 = _mm_sub_epi8(a_2, r16_2);
+    return _mm_unpacklo_epi64(r16_1,r16_2);
+}
+#endif
+
+uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLS.I16 q0,q0,q0
+#define vmlsq_u16 vmlsq_s16
+
+uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLS.I32 q0,q0,q0
+#define vmlsq_u32 vmlsq_s32
+
+//******************** Vector multiply subtract long (widening multiply subtract) ************************************
+//*************************************************************************************************************
+
+//******  Vector saturating doubling multiply high **********************
+//*************************************************************************
+//For some ARM implementations if the multiply high result is all 0xffffffff then it is not doubled. We do the same here
+
+int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b);         // VQDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b)         // VQDMULH.S16 q0,q0,q0
+{
+    __m128i res_sat, cffff, mask, res;
+    res = _mm_mulhi_epi16 (a, b);
+    cffff = _mm_cmpeq_epi16(res,res);         //0xffff
+    mask = _mm_cmpeq_epi16(res, cffff);         //if ffff need to saturate
+    res_sat = _mm_adds_epi16(res, res);         //res *= 2 and saturate
+    return _mm_or_si128(mask, res_sat);
+}
+
+#if defined(USE_SSSE3)
+int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b);         // VQDMULH.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{         // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
+    __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1;
+    ab = _mm_unpacklo_epi32 (a, b);         //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b, a);         //b0, a0, b1,a1
+    mul = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    ab = _mm_unpackhi_epi32 (a, b);         //a2, b2, a3,b3
+    ba = _mm_unpackhi_epi32 (b, a);         //b2, a2, b3,a3
+    mul1 = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
+    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
+    mul = _mm_unpacklo_epi64(mul, mul1);
+    cffffffff = _mm_cmpeq_epi32(mul,mul);         //0xffffffff
+    mask = _mm_cmpeq_epi32(mul, cffffffff);         //if ffffffff need to saturate
+    res_sat = vqd_s32(mul);
+    return _mm_or_si128(mask, res_sat);
+}
+#endif
+
+//********* Vector saturating rounding doubling multiply high ****************
+//****************************************************************************
+//If use _mm_mulhrs_xx functions  the result may differ from NEON one a little  due to different rounding rules and order
+
+#if defined(USE_SSSE3)
+int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b);         // VQRDMULH.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b)         // VQRDMULH.S16 q0,q0,q0
+{
+    __m128i res_sat, cffff, mask, res;
+    res = _mm_mulhrs_epi16 (a, b);
+    cffff = _mm_cmpeq_epi16(res,res);         //0xffff
+    mask = _mm_cmpeq_epi16(res, cffff);         //if ffff need to saturate
+    res_sat = _mm_adds_epi16(res, res);         //res *= 2 and saturate
+    return _mm_or_si128(mask, res_sat);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b);         // VQRDMULH.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{         // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
+    __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1, mask1;
+    ab = _mm_unpacklo_epi32 (a, b);         //a0, b0, a1,b1
+    ba = _mm_unpacklo_epi32 (b, a);         //b0, a0, b1,a1
+    mul = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    ab = _mm_unpackhi_epi32 (a, b);         //a2, b2, a3,b3
+    ba = _mm_unpackhi_epi32 (b, a);         //b2, a2, b3,a3
+    mul1 = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
+    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
+    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
+    mul = _mm_unpacklo_epi64(mul, mul1);
+    cffffffff = _mm_cmpeq_epi32(mul,mul);         //0xffffffff
+    mask1 = _mm_slli_epi32(mul, 17);         //shift left then back right to
+    mask1 = _mm_srli_epi32(mul,31);         //get  15-th bit 1 or zero
+    mul = _mm_add_epi32 (mul, mask1);         //actual rounding
+    mask = _mm_cmpeq_epi32(mul, cffffffff);         //if ffffffff need to saturate
+    res_sat = vqd_s32(mul);
+    return _mm_or_si128(mask, res_sat);
+}
+#endif
+
+//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
+//*************************************************************************************************************************
+
+//************************************************************************************
+//******************  Vector subtract ***********************************************
+//************************************************************************************
+
+int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b);         // VSUB.I8 q0,q0,q0
+#define vsubq_s8 _mm_sub_epi8
+
+int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b);         // VSUB.I16 q0,q0,q0
+#define vsubq_s16 _mm_sub_epi16
+
+int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b);         // VSUB.I32 q0,q0,q0
+#define vsubq_s32 _mm_sub_epi32
+
+int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b);         // VSUB.I64 q0,q0,q0
+#define vsubq_s64 _mm_sub_epi64
+
+float32x4_t vsubq_f32(float32x4_t a, float32x4_t b);         // VSUB.F32 q0,q0,q0
+#define vsubq_f32 _mm_sub_ps
+
+uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b);         // VSUB.I8 q0,q0,q0
+#define vsubq_u8 _mm_sub_epi8
+
+uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b);         // VSUB.I16 q0,q0,q0
+#define vsubq_u16 _mm_sub_epi16
+
+uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b);         // VSUB.I32 q0,q0,q0
+#define vsubq_u32 _mm_sub_epi32
+
+uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b);         // VSUB.I64 q0,q0,q0
+#define vsubq_u64 _mm_sub_epi64
+
+//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
+//***********************************************************************************
+//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
+
+//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
+//*****************************************************************************************************
+
+//************************Vector saturating subtract *********************************
+//*************************************************************************************
+
+int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b);         // VQSUB.S8 q0,q0,q0
+#define vqsubq_s8 _mm_subs_epi8
+
+int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b);         // VQSUB.S16 q0,q0,q0
+#define vqsubq_s16 _mm_subs_epi16
+
+int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b);         // VQSUB.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
+{         //no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a
+    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
+    c7fffffff = _mm_set1_epi32(0x7fffffff);
+    res = _mm_sub_epi32(a, b);
+    res_sat = _mm_srli_epi32(a, 31);
+    res_sat = _mm_add_epi32(res_sat, c7fffffff);
+    res_xor_a = _mm_xor_si128(res, a);
+    b_xor_a = _mm_xor_si128(b, a);
+    res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
+    res_xor_a = _mm_srai_epi32(res_xor_a,31);         //propagate the sigh bit, all ffff if <0 all ones otherwise
+    res_sat = _mm_and_si128(res_xor_a, res_sat);
+    res = _mm_andnot_si128(res_xor_a, res);
+    return _mm_or_si128(res, res_sat);
+}
+
+int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b);         // VQSUB.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)         //no optimal SIMD soulution
+{
+    _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    _mm_store_si128((__m128i*)btmp, b);
+    res[0] = atmp[0] - btmp[0];
+    res[1] = atmp[1] - btmp[1];
+    if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
+        res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
+    }
+    if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
+        res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b);         // VQSUB.U8 q0,q0,q0
+#define vqsubq_u8 _mm_subs_epu8
+
+uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b);         // VQSUB.s16 q0,q0,q0
+#define vqsubq_u16 _mm_subs_epu16
+
+uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b);         // VQSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b)         // VQSUB.U32 q0,q0,q0
+{
+    __m128i min, mask, sub;
+    min = _MM_MIN_EPU32(a, b);         //SSE4.1
+    mask = _mm_cmpeq_epi32 (min,  b);
+    sub = _mm_sub_epi32 (a, b);
+    return _mm_and_si128 ( sub, mask);
+}
+
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL);         // VQSUB.U64 q0,q0,q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
+    {
+        __m128i c80000000, subb, suba, cmp, sub;
+        c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
+        sub  = _mm_sub_epi64 (a, b);
+        suba = _mm_sub_epi64 (a, c80000000);
+        subb = _mm_sub_epi64 (b, c80000000);
+        cmp = _mm_cmpgt_epi64 ( suba, subb);         //no unsigned comparison, need to go to signed, SSE4.2!!!
+        return _mm_and_si128 (sub, cmp);         //saturation
+    }
+#else
+    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+    {
+        _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
+        _mm_store_si128((__m128i*)atmp, a);
+        _mm_store_si128((__m128i*)btmp, b);
+        res[0] = (atmp[0] > btmp[0]) ? atmp[0] -  btmp[0] : 0;
+        res[1] = (atmp[1] > btmp[1]) ? atmp[1] -  btmp[1] : 0;
+        return _mm_load_si128((__m128i*)(res));
+    }
+#endif
+
+//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
+//****************************************************************
+
+int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b);         // VHSUB.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b)         // VHSUB.S8 q0,q0,q0
+{         // //need to deal with the possibility of internal overflow
+    __m128i c128, au,bu;
+    c128 = _mm_set1_epi8 (128);
+    au = _mm_add_epi8( a, c128);
+    bu = _mm_add_epi8( b, c128);
+    return vhsubq_u8(au,bu);
+}
+
+int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b);         // VHSUB.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b)         // VHSUB.S16 q0,q0,q0
+{         //need to deal with the possibility of internal overflow
+    __m128i c8000, au,bu;
+    c8000 = _mm_set1_epi16(0x8000);
+    au = _mm_add_epi16( a, c8000);
+    bu = _mm_add_epi16( b, c8000);
+    return vhsubq_u16(au,bu);
+}
+
+int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b);         // VHSUB.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b)         // VHSUB.S32 q0,q0,q0
+{//need to deal with the possibility of internal overflow
+    __m128i a2, b2,r, b_1;
+    a2 = _mm_srai_epi32 (a,1);
+    b2 = _mm_srai_epi32 (b,1);
+    r = _mm_sub_epi32 (a2, b2);
+    b_1 = _mm_andnot_si128(a, b); //!a and b
+    b_1 = _mm_slli_epi32 (b_1,31);
+    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
+    return _mm_sub_epi32(r,b_1);
+}
+
+uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b);         // VHSUB.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b)         // VHSUB.U8 q0,q0,q0
+{
+    __m128i avg;
+    avg = _mm_avg_epu8 (a, b);
+    return _mm_sub_epi8(a, avg);
+}
+
+uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b);         // VHSUB.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b)         // VHSUB.s16 q0,q0,q0
+{
+    __m128i avg;
+    avg = _mm_avg_epu16 (a, b);
+    return _mm_sub_epi16(a, avg);
+}
+
+uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b);         // VHSUB.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b)         // VHSUB.U32 q0,q0,q0
+{//need to deal with the possibility of internal overflow
+    __m128i a2, b2,r, b_1;
+    a2 = _mm_srli_epi32 (a,1);
+    b2 = _mm_srli_epi32 (b,1);
+    r = _mm_sub_epi32 (a2, b2);
+    b_1 = _mm_andnot_si128(a, b); //!a and b
+    b_1 = _mm_slli_epi32 (b_1,31);
+    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
+    return _mm_sub_epi32(r,b_1);
+}
+
+//******* Vector subtract high half (truncated) ** ************
+//************************************************************
+
+//************ Vector rounding subtract high half *********************
+//*********************************************************************
+
+//*********** Vector saturating doubling multiply subtract long ********************
+//************************************************************************************
+
+//******************  COMPARISON ***************************************
+//******************* Vector compare equal *************************************
+//****************************************************************************
+
+uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b);         // VCEQ.I8 q0, q0, q0
+#define vceqq_s8 _mm_cmpeq_epi8
+
+uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b);         // VCEQ.I16 q0, q0, q0
+#define vceqq_s16 _mm_cmpeq_epi16
+
+uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b);         // VCEQ.I32 q0, q0, q0
+#define vceqq_s32 _mm_cmpeq_epi32
+
+uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b);         // VCEQ.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmpeq_ps(a,b);
+    return *(__m128i*)&res;
+}
+
+uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b);         // VCEQ.I8 q0, q0, q0
+#define vceqq_u8 _mm_cmpeq_epi8
+
+uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b);         // VCEQ.I16 q0, q0, q0
+#define vceqq_u16 _mm_cmpeq_epi16
+
+uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b);         // VCEQ.I32 q0, q0, q0
+#define vceqq_u32 _mm_cmpeq_epi32
+
+uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b);         // VCEQ.I8 q0, q0, q0
+#define vceqq_p8 _mm_cmpeq_epi8
+
+//******************Vector compare greater-than or equal*************************
+//*******************************************************************************
+//in IA SIMD no greater-than-or-equal comparison for integers,
+// there is greater-than available only, so we need the following tricks
+
+uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b)         // VCGE.S8 q0, q0, q0
+{
+    __m128i m1, m2;
+    m1 = _mm_cmpgt_epi8 ( a, b);
+    m2 = _mm_cmpeq_epi8 ( a, b);
+    return _mm_or_si128  ( m1, m2);
+}
+
+uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b)         // VCGE.S16 q0, q0, q0
+{
+    __m128i m1, m2;
+    m1 = _mm_cmpgt_epi16 ( a, b);
+    m2 = _mm_cmpeq_epi16 ( a, b);
+    return _mm_or_si128   ( m1,m2);
+}
+
+uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b)         // VCGE.S32 q0, q0, q0
+{
+    __m128i m1, m2;
+    m1 = _mm_cmpgt_epi32 (a, b);
+    m2 = _mm_cmpeq_epi32 (a, b);
+    return _mm_or_si128   (m1, m2);
+}
+
+uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmpge_ps(a,b);         //use only 2 first entries
+    return *(__m128i*)&res;
+}
+
+uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b)         // VCGE.U8 q0, q0, q0
+{         //no unsigned chars comparison, only signed available,so need the trick
+    #ifdef USE_SSE4
+        __m128i cmp;
+        cmp = _mm_max_epu8(a, b);
+        return _mm_cmpeq_epi8(cmp, a);         //a>=b
+    #else
+        __m128i c128, as, bs, m1, m2;
+        c128 = _mm_set1_epi8 (128);
+        as = _mm_sub_epi8( a, c128);
+        bs = _mm_sub_epi8( b, c128);
+        m1 = _mm_cmpgt_epi8( as, bs);
+        m2 = _mm_cmpeq_epi8 (as, bs);
+        return _mm_or_si128 ( m1,  m2);
+    #endif
+}
+
+uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b)         // VCGE.s16 q0, q0, q0
+{         //no unsigned shorts comparison, only signed available,so need the trick
+    #ifdef USE_SSE4
+        __m128i cmp;
+        cmp = _mm_max_epu16(a, b);
+        return _mm_cmpeq_epi16(cmp, a);         //a>=b
+    #else
+        __m128i c8000, as, bs, m1, m2;
+        c8000 = _mm_set1_epi16 (0x8000);
+        as = _mm_sub_epi16(a,c8000);
+        bs = _mm_sub_epi16(b,c8000);
+        m1 = _mm_cmpgt_epi16(as, bs);
+        m2 = _mm_cmpeq_epi16 (as, bs);
+        return _mm_or_si128 ( m1, m2);
+    #endif
+}
+
+uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b)         // VCGE.U32 q0, q0, q0
+{         //no unsigned ints comparison, only signed available,so need the trick
+    #ifdef USE_SSE4
+        __m128i cmp;
+        cmp = _mm_max_epu32(a, b);
+        return _mm_cmpeq_epi32(cmp, a);         //a>=b
+    #else
+        //serial solution may be faster
+        __m128i c80000000, as, bs, m1, m2;
+        c80000000 = _mm_set1_epi32 (0x80000000);
+        as = _mm_sub_epi32(a,c80000000);
+        bs = _mm_sub_epi32(b,c80000000);
+        m1 = _mm_cmpgt_epi32 (as, bs);
+        m2 = _mm_cmpeq_epi32 (as, bs);
+        return _mm_or_si128 ( m1,  m2);
+    #endif
+}
+
+//**********************Vector compare less-than or equal******************************
+//***************************************************************************************
+//in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks
+
+uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b)         // VCGE.S8 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi8 (a,a);         //all ones 0xff....
+    res = _mm_cmpgt_epi8 ( a,  b);
+    return _mm_andnot_si128 (res, c1);         //inverse the cmpgt result, get less-than-or-equal
+}
+
+uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b)         // VCGE.S16 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi16 (a,a);         //all ones 0xff....
+    res = _mm_cmpgt_epi16 ( a,  b);
+    return _mm_andnot_si128 (res, c1);
+}
+
+uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b)         // VCGE.S32 q0, q0, q0
+{
+    __m128i c1, res;
+    c1 = _mm_cmpeq_epi32 (a,a);         //all ones 0xff....
+    res = _mm_cmpgt_epi32 ( a,  b);
+    return _mm_andnot_si128 (res, c1);
+}
+
+uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmple_ps(a,b);
+    return *(__m128i*)&res;
+}
+
+uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b)         // VCGE.U8 q0, q0, q0
+    {         //no unsigned chars comparison in SSE, only signed available,so need the trick
+
+        __m128i cmp;
+        cmp = _mm_min_epu8(a, b);
+        return _mm_cmpeq_epi8(cmp, a);         //a<=b
+    }
+#else
+    #define vcleq_u8(a,b) vcgeq_u8(b,a)
+#endif
+
+uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.s16 q0, q0, q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b)         // VCGE.s16 q0, q0, q0
+    {         //no unsigned shorts comparison in SSE, only signed available,so need the trick
+        __m128i cmp;
+        cmp = _mm_min_epu16(a, b);
+        return _mm_cmpeq_epi16(cmp, a);         //a<=b
+    }
+#else
+    #define vcleq_u16(a,b) vcgeq_u16(b,a)
+#endif
+
+uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
+#ifdef USE_SSE4
+    _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b)         // VCGE.U32 q0, q0, q0
+    {         //no unsigned chars comparison in SSE, only signed available,so need the trick
+        __m128i cmp;
+        cmp = _mm_min_epu32(a, b);
+        return _mm_cmpeq_epi32(cmp, a);         //a<=b
+    }
+#else
+//solution may be not optimal compared with the serial one
+    #define vcleq_u32(a,b) vcgeq_u32(b,a)
+#endif
+
+//****** Vector compare greater-than ******************************************
+//**************************************************************************
+
+uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
+#define vcgtq_s8 _mm_cmpgt_epi8
+
+uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
+#define vcgtq_s16 _mm_cmpgt_epi16
+
+uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
+#define vcgtq_s32 _mm_cmpgt_epi32
+
+uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
+{
+    __m128 res;
+    res = _mm_cmpgt_ps(a,b);         //use only 2 first entries
+    return *(__m128i*)&res;
+}
+
+uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b)         // VCGT.U8 q0, q0, q0
+{         //no unsigned chars comparison, only signed available,so need the trick
+    __m128i c128, as, bs;
+    c128 = _mm_set1_epi8 (128);
+    as = _mm_sub_epi8(a,c128);
+    bs = _mm_sub_epi8(b,c128);
+    return _mm_cmpgt_epi8 (as, bs);
+}
+
+uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.s16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b)         // VCGT.s16 q0, q0, q0
+{         //no unsigned short comparison, only signed available,so need the trick
+    __m128i c8000, as, bs;
+    c8000 = _mm_set1_epi16 (0x8000);
+    as = _mm_sub_epi16(a,c8000);
+    bs = _mm_sub_epi16(b,c8000);
+    return _mm_cmpgt_epi16 ( as, bs);
+}
+
+uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b)         // VCGT.U32 q0, q0, q0
+{         //no unsigned int comparison, only signed available,so need the trick
+    __m128i c80000000, as, bs;
+    c80000000 = _mm_set1_epi32 (0x80000000);
+    as = _mm_sub_epi32(a,c80000000);
+    bs = _mm_sub_epi32(b,c80000000);
+    return _mm_cmpgt_epi32 ( as, bs);
+}
+
+//********************* Vector compare less-than **************************
+//*************************************************************************
+
+uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
+#define vcltq_s8(a,b) vcgtq_s8(b, a)         //swap the arguments!!
+
+uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
+#define vcltq_s16(a,b) vcgtq_s16(b, a)         //swap the arguments!!
+
+uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
+#define vcltq_s32(a,b) vcgtq_s32(b, a)         //swap the arguments!!
+
+uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
+#define vcltq_f32(a,b) vcgtq_f32(b, a)         //swap the arguments!!
+
+uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
+#define vcltq_u8(a,b) vcgtq_u8(b, a)         //swap the arguments!!
+
+uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.s16 q0, q0, q0
+#define vcltq_u16(a,b) vcgtq_u16(b, a)         //swap the arguments!!
+
+uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
+#define vcltq_u32(a,b) vcgtq_u32(b, a)         //swap the arguments!!
+
+//*****************Vector compare absolute greater-than or equal ************
+//***************************************************************************
+
+uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b)         // VACGE.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmpge_ps ( a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//********Vector compare absolute less-than or equal ******************
+//********************************************************************
+
+uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b)         // VACGE.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmple_ps (a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//********  Vector compare absolute greater-than    ******************
+//******************************************************************
+
+uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b)         // VACGT.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmpgt_ps (a0, b0);
+    return (*(__m128i*)&a0);
+}
+
+//***************Vector compare absolute less-than  ***********************
+//*************************************************************************
+
+uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b)         // VACGT.F32 q0, q0, q0
+{
+    __m128i c7fffffff;
+    __m128 a0, b0;
+    c7fffffff = _mm_set1_epi32 (0x7fffffff);
+    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
+    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
+    a0 = _mm_cmplt_ps (a0, b0);
+    return (*(__m128i*)&a0);
+
+}
+
+//*************************Vector test bits************************************
+//*****************************************************************************
+/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
+with the corresponding element of a second vector. If the result is not zero, the
+corresponding element in the destination vector is set to all ones. Otherwise, it is set to
+all zeros. */
+
+uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b);         // VTST.8 q0, q0, q0
+_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b)         // VTST.8 q0, q0, q0
+{
+    __m128i zero, one, res;
+    zero = _mm_setzero_si128 ();
+    one = _mm_cmpeq_epi8(zero,zero);         //0xfff..ffff
+    res = _mm_and_si128 (a, b);
+    res =  _mm_cmpeq_epi8 (res, zero);
+    return _mm_xor_si128(res, one);         //invert result
+}
+
+uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b);         // VTST.16 q0, q0, q0
+_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b)         // VTST.16 q0, q0, q0
+{
+    __m128i zero, one, res;
+    zero = _mm_setzero_si128 ();
+    one = _mm_cmpeq_epi8(zero,zero);         //0xfff..ffff
+    res = _mm_and_si128 (a, b);
+    res =  _mm_cmpeq_epi16 (res, zero);
+    return _mm_xor_si128(res, one);         //invert result
+}
+
+uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b);         // VTST.32 q0, q0, q0
+_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b)         // VTST.32 q0, q0, q0
+{
+    __m128i zero, one, res;
+    zero = _mm_setzero_si128 ();
+    one = _mm_cmpeq_epi8(zero,zero);         //0xfff..ffff
+    res = _mm_and_si128 (a, b);
+    res =  _mm_cmpeq_epi32 (res, zero);
+    return _mm_xor_si128(res, one);         //invert result
+}
+
+uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b);         // VTST.8 q0, q0, q0
+#define vtstq_u8 vtstq_s8
+
+uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b);         // VTST.16 q0, q0, q0
+#define vtstq_u16 vtstq_s16
+
+uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b);         // VTST.32 q0, q0, q0
+#define vtstq_u32 vtstq_s32
+
+uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b);         // VTST.8 q0, q0, q0
+#define vtstq_p8 vtstq_u8
+
+//****************** Absolute difference ********************
+//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
+//************************************************************
+#if defined(USE_SSSE3)
+
+#endif
+
+#if defined(USE_SSSE3)
+int8x16_t vabdq_s8(int8x16_t a, int8x16_t b);         // VABD.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b)         // VABD.S8 q0,q0,q0
+{
+    __m128i res;
+    res = _mm_sub_epi8 (a, b);
+    return _mm_abs_epi8 (res);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int16x8_t vabdq_s16(int16x8_t a, int16x8_t b);         // VABD.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b)         // VABD.S16 q0,q0,q0
+{
+    __m128i res;
+    res = _mm_sub_epi16 (a,b);
+    return _mm_abs_epi16 (res);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int32x4_t vabdq_s32(int32x4_t a, int32x4_t b);         // VABD.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b)         // VABD.S32 q0,q0,q0
+{
+    __m128i res;
+    res = _mm_sub_epi32 (a,b);
+    return _mm_abs_epi32 (res);
+}
+#endif
+
+uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b);         // VABD.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b)         //no abs for unsigned
+{
+    __m128i cmp, difab, difba;
+    cmp = vcgtq_u8(a,b);
+    difab = _mm_sub_epi8(a,b);
+    difba = _mm_sub_epi8 (b,a);
+    difab = _mm_and_si128(cmp, difab);
+    difba = _mm_andnot_si128(cmp, difba);
+    return _mm_or_si128(difab, difba);
+}
+
+uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b);         // VABD.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
+{
+    __m128i cmp, difab, difba;
+    cmp = vcgtq_u16(a,b);
+    difab = _mm_sub_epi16(a,b);
+    difba = _mm_sub_epi16 (b,a);
+    difab = _mm_and_si128(cmp, difab);
+    difba = _mm_andnot_si128(cmp, difba);
+    return _mm_or_si128(difab, difba);
+}
+
+uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b);         // VABD.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
+{
+    __m128i cmp, difab, difba;
+    cmp = vcgtq_u32(a,b);
+    difab = _mm_sub_epi32(a,b);
+    difba = _mm_sub_epi32 (b,a);
+    difab = _mm_and_si128(cmp, difab);
+    difba = _mm_andnot_si128(cmp, difba);
+    return _mm_or_si128(difab, difba);
+}
+
+float32x4_t vabdq_f32(float32x4_t a, float32x4_t b);         // VABD.F32 q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b)         // VABD.F32 q0,q0,q0
+{
+    __m128i c1;
+    __m128 res;
+    c1 =  _mm_set1_epi32(0x7fffffff);
+    res = _mm_sub_ps (a, b);
+    return _mm_and_ps (res, *(__m128*)&c1);
+}
+
+//************  Absolute difference - long **************************
+//********************************************************************
+
+//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
+//*********************************************************************************************
+
+#if defined(USE_SSSE3)
+int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VABA.S8 q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c)         // VABA.S8 q0,q0,q0
+{
+    int8x16_t sub;
+    sub = vabdq_s8(b, c);
+    return vaddq_s8( a, sub);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VABA.S16 q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c)         // VABA.S16 q0,q0,q0
+{
+    int16x8_t sub;
+    sub = vabdq_s16(b, c);
+    return vaddq_s16( a, sub);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VABA.S32 q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c)         // VABA.S32 q0,q0,q0
+{
+    int32x4_t sub;
+    sub = vabdq_s32(b, c);
+    return vaddq_s32( a, sub);
+}
+#endif
+
+uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VABA.U8 q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
+{
+    uint8x16_t sub;
+    sub = vabdq_u8(b, c);
+    return vaddq_u8( a, sub);
+}
+
+uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VABA.s16 q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
+{
+    uint16x8_t sub;
+    sub = vabdq_u16(b, c);
+    return vaddq_u16( a, sub);
+}
+
+uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VABA.U32 q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
+{
+    uint32x4_t sub;
+    sub = vabdq_u32(b, c);
+    return vaddq_u32( a, sub);
+}
+
+//************** Absolute difference and accumulate - long ********************************
+//*************************************************************************************
+
+//***********************************************************************************
+//****************  Maximum and minimum operations **********************************
+//***********************************************************************************
+//************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
+//***********************************************************************************
+
+int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b);         // VMAX.S8 q0,q0,q0
+#define vmaxq_s8 _MM_MAX_EPI8         //SSE4.1
+
+int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b);         // VMAX.S16 q0,q0,q0
+#define vmaxq_s16 _mm_max_epi16
+
+int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b);         // VMAX.S32 q0,q0,q0
+#define vmaxq_s32 _MM_MAX_EPI32         //SSE4.1
+
+uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b);         // VMAX.U8 q0,q0,q0
+#define vmaxq_u8 _mm_max_epu8
+
+uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b);         // VMAX.s16 q0,q0,q0
+#define vmaxq_u16 _MM_MAX_EPU16         //SSE4.1
+
+uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b);         // VMAX.U32 q0,q0,q0
+#define vmaxq_u32 _MM_MAX_EPU32         //SSE4.1
+
+float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b);         // VMAX.F32 q0,q0,q0
+#define vmaxq_f32 _mm_max_ps
+
+//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
+//***********************************************************************************************************
+
+int8x16_t   vminq_s8(int8x16_t a, int8x16_t b);         // VMIN.S8 q0,q0,q0
+#define vminq_s8 _MM_MIN_EPI8         //SSE4.1
+
+int16x8_t   vminq_s16(int16x8_t a, int16x8_t b);         // VMIN.S16 q0,q0,q0
+#define vminq_s16 _mm_min_epi16
+
+int32x4_t   vminq_s32(int32x4_t a, int32x4_t b);         // VMIN.S32 q0,q0,q0
+#define vminq_s32 _MM_MIN_EPI32         //SSE4.1
+
+uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b);         // VMIN.U8 q0,q0,q0
+#define vminq_u8 _mm_min_epu8
+
+uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b);         // VMIN.s16 q0,q0,q0
+#define vminq_u16 _MM_MIN_EPU16         //SSE4.1
+
+uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b);         // VMIN.U32 q0,q0,q0
+#define vminq_u32 _MM_MIN_EPU32         //SSE4.1
+
+float32x4_t vminq_f32(float32x4_t a, float32x4_t b);         // VMIN.F32 q0,q0,q0
+#define vminq_f32 _mm_min_ps
+
+//*************  Pairwise addition operations. **************************************
+//************************************************************************************
+//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
+
+//**************************  Long pairwise add  **********************************
+//*********************************************************************************
+//Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width,
+// and places the final results in the destination vector.
+
+#if defined(USE_SSSE3)
+int16x8_t vpaddlq_s8(int8x16_t a);         // VPADDL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a)         // VPADDL.S8 q0,q0
+{         //no 8 bit hadd in IA32, need to go to 16 bit
+    __m128i r16_1, r16_2;
+    r16_1 = _MM_CVTEPI8_EPI16 (a);         // SSE 4.1
+    //swap hi and low part of r to process the remaining data
+    r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
+    return _mm_hadd_epi16 (r16_1, r16_2);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int32x4_t vpaddlq_s16(int16x8_t a);         // VPADDL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a)         // VPADDL.S16 q0,q0
+{         //no 8 bit hadd in IA32, need to go to 16 bit
+    __m128i r32_1, r32_2;
+    r32_1 = _MM_CVTEPI16_EPI32(a);
+    //swap hi and low part of r to process the remaining data
+    r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
+    return _mm_hadd_epi32 (r32_1, r32_2);
+}
+#endif
+
+int64x2_t vpaddlq_s32(int32x4_t a);         // VPADDL.S32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)         // VPADDL.S32 q0,q0
+{
+    _NEON2SSE_ALIGN_16 int32_t atmp[4];
+    _NEON2SSE_ALIGN_16 int64_t res[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
+    res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
+    return _mm_load_si128((__m128i*)res);
+}
+
+#if defined(USE_SSSE3)
+uint16x8_t vpaddlq_u8(uint8x16_t a);         // VPADDL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a)         // VPADDL.U8 q0,q0
+{         //no 8 bit hadd in IA32, need to go to 16 bit
+    __m128i r16_1, r16_2;
+    r16_1 = _MM_CVTEPU8_EPI16(a);
+    //swap hi and low part of r to process the remaining data
+    r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
+    return _mm_hadd_epi16 (r16_1, r16_2);
+}
+#endif
+
+uint32x4_t vpaddlq_u16(uint16x8_t a);         // VPADDL.s16 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
+{         //serial solution looks faster than a SIMD one
+    _NEON2SSE_ALIGN_16 uint16_t atmp[8];
+    _NEON2SSE_ALIGN_16 uint32_t res[4];
+    _mm_store_si128((__m128i*)atmp, a);
+    res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
+    res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
+    res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
+    res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
+    return _mm_load_si128((__m128i*)res);
+}
+
+uint64x2_t vpaddlq_u32(uint32x4_t a);         // VPADDL.U32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    _NEON2SSE_ALIGN_16 uint32_t atmp[4];
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    _mm_store_si128((__m128i*)atmp, a);
+    res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
+    res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
+    return _mm_load_si128((__m128i*)res);
+}
+
+//************************  Long pairwise add and accumulate **************************
+//****************************************************************************************
+//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
+// and accumulates the  values of the results into the elements of the destination (wide) vector
+
+#if defined(USE_SSSE3)
+int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b);         // VPADAL.S8 q0,q0
+_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b)         // VPADAL.S8 q0,q0
+{
+    int16x8_t pad;
+    pad = vpaddlq_s8(b);
+    return _mm_add_epi16 (a, pad);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b);         // VPADAL.S16 q0,q0
+_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b)         // VPADAL.S16 q0,q0
+{
+    int32x4_t pad;
+    pad = vpaddlq_s16(b);
+    return _mm_add_epi32(a, pad);
+}
+#endif
+
+int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b);         // VPADAL.S32 q0,q0
+_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
+{
+    int64x2_t pad;
+    pad = vpaddlq_s32(b);
+    return _mm_add_epi64 (a, pad);
+}
+
+#if defined(USE_SSSE3)
+uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b);         // VPADAL.U8 q0,q0
+_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b)         // VPADAL.U8 q0,q0
+{
+    uint16x8_t pad;
+    pad = vpaddlq_u8(b);
+    return _mm_add_epi16 (a, pad);
+}
+#endif
+
+uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b);         // VPADAL.s16 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    uint32x4_t pad;
+    pad = vpaddlq_u16(b);
+    return _mm_add_epi32(a, pad);
+}         //no optimal SIMD solution, serial is faster
+
+uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b);         // VPADAL.U32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
+{         //no optimal SIMD solution, serial is faster
+    uint64x2_t pad;
+    pad = vpaddlq_u32(b);
+    return _mm_add_epi64(a, pad);
+}         //no optimal SIMD solution, serial is faster
+
+//**********  Folding maximum   *************************************
+//*******************************************************************
+//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
+//and copies the larger of each pair into the corresponding element in the destination
+//    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
+
+// ***************** Folding minimum  ****************************
+// **************************************************************
+//vpmin -> takes minimum of adjacent pairs
+
+//***************************************************************
+//***********  Reciprocal/Sqrt ************************************
+//***************************************************************
+//****************** Reciprocal estimate *******************************
+
+//the ARM NEON and x86 SIMD results may be slightly different
+
+float32x4_t vrecpeq_f32(float32x4_t a);         // VRECPE.F32 q0,q0
+//the ARM NEON and x86 SIMD results may be slightly different
+#define vrecpeq_f32 _mm_rcp_ps
+
+uint32x4_t vrecpeq_u32(uint32x4_t a);         // VRECPE.U32 q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
+{         //no reciprocal for ints in IA32 available, neither for  unsigned int to float 4 lanes conversion, so serial solution looks faster
+    _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
+    _mm_store_si128((__m128i*)atmp, a);
+    res[0] = (atmp[0]) ? 1 / atmp[0] : 0xffffffff;
+    res[1] = (atmp[1]) ? 1 / atmp[1] : 0xffffffff;
+    return _mm_load_si128((__m128i*)res);
+}
+
+//**********Reciprocal square root estimate ****************
+//**********************************************************
+//no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster
+
+float32x4_t vrsqrteq_f32(float32x4_t a);         // VRSQRTE.F32 q0,q0
+//the ARM NEON and x86 SIMD results may be slightly different
+#define vrsqrteq_f32 _mm_rsqrt_ps
+
+uint32x4_t vrsqrteq_u32(uint32x4_t a);         // VRSQRTE.U32 q0,q0
+#define vrsqrteq_u32(a) _mm_castps_si128(_mm_rsqrt_ps(_M128(a)) )
+
+//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
+//******************************************************************************************
+//******VRECPS (Vector Reciprocal Step) ***************************************************
+//multiplies the elements of one vector by the corresponding elements of another vector,
+//subtracts each of the results from 2, and places the final results into the elements of the destination vector.
+
+float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b);         // VRECPS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b)         // VRECPS.F32 q0, q0, q0
+{
+    __m128 f2, mul;
+    f2 =  _mm_set1_ps(2.);
+    mul = _mm_mul_ps(a,b);
+    return _mm_sub_ps(f2,mul);
+}
+
+//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
+//multiplies the elements of one vector by the corresponding elements of another vector,
+//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
+
+float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b);         // VRSQRTS.F32 q0, q0, q0
+_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b)         // VRSQRTS.F32 q0, q0, q0
+{
+    __m128 f3, f05, mul;
+    f3 =  _mm_set1_ps(3.);
+    f05 =  _mm_set1_ps(0.5);
+    mul = _mm_mul_ps(a,b);
+    f3 = _mm_sub_ps(f3,mul);
+    return _mm_mul_ps (f3, f05);
+}
+//********************************************************************************************
+//***************************** Shifts by signed variable ***********************************
+//********************************************************************************************
+//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
+//********************************************************************************************
+//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
+//helper macro. It matches ARM implementation for big shifts
+#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
+        else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
+        return _mm_load_si128((__m128i*)res);
+
+int8x16_t vshlq_s8(int8x16_t a, int8x16_t b);         // VSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int8_t, int8_t, 16, 16)
+}
+
+int16x8_t vshlq_s16(int16x8_t a, int16x8_t b);         // VSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int16_t, int16_t, 8, 8)
+}
+
+int32x4_t vshlq_s32(int32x4_t a, int32x4_t b);         // VSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int32_t, int32_t, 4, 4)
+}
+
+int64x2_t vshlq_s64(int64x2_t a, int64x2_t b);         // VSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(int64_t, int64_t, 2, 2)
+}
+
+uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b);         // VSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
+}
+
+uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b);         // VSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
+}
+
+uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b);         // VSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
+}
+
+uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b);         // VSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
+}
+
+//*********** Vector saturating shift left: (negative values shift right) **********************
+//********************************************************************************************
+//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
+#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
+        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if (atmp[i] ==0) res[i] = 0; \
+        else{ \
+            if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
+            else{ \
+                if (btmp[i]>lanesize_1) { \
+                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
+                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
+                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                    else res[i] = atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
+        TYPE lanesize = (sizeof(TYPE) << 3); \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if (atmp[i] ==0) {res[i] = 0; \
+        }else{ \
+            if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
+            else{ \
+                if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
+                else{ \
+                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
+                    res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b);         // VQSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
+}
+
+int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b);         // VQSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
+}
+
+int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b);         // VQSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
+}
+
+int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b);         // VQSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
+}
+
+uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b);         // VQSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
+}
+
+uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b);         // VQSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
+}
+
+uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b);         // VQSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
+}
+
+uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b);         // VQSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
+}
+
+//******** Vector rounding shift left: (negative values shift right) **********
+//****************************************************************************
+//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
+//rounding makes sense for right shifts only.
+#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if( btmp[i] >= 0) { \
+            if(btmp[i] >= lanesize) res[i] = 0; \
+            else res[i] = (atmp[i] << btmp[i]); \
+        }else{ \
+            res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \
+                            (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
+                            (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
+        return _mm_load_si128((__m128i*)res);
+
+int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b);         // VRSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
+}
+
+int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b);         // VRSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
+}
+
+int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b);         // VRSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
+}
+
+int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b);         // VRSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
+}
+
+uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b);         // VRSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
+}
+
+uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b);         // VRSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
+}
+
+uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b);         // VRSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
+}
+
+uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b);         // VRSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
+}
+
+//********** Vector saturating rounding shift left: (negative values shift right) ****************
+//*************************************************************************************************
+//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
+//Saturation happens for left shifts only while rounding makes sense for right shifts only.
+#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
+        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if (atmp[i] ==0) res[i] = 0; \
+        else{ \
+            if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
+            else{ \
+                if (btmp[i]>lanesize_1) { \
+                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                }else{ \
+                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
+                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
+                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
+                    else res[i] = atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
+        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
+        int lanesize = (sizeof(TYPE) << 3); \
+        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
+        for (i = 0; i<LEN; i++) { \
+        if (atmp[i] ==0) {res[i] = 0; \
+        }else{ \
+            if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
+            else{ \
+                if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
+                else{ \
+                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
+                    res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
+        return _mm_load_si128((__m128i*)res);
+
+int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b);         // VQRSHL.S8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
+}
+
+int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b);         // VQRSHL.S16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
+}
+
+int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b);         // VQRSHL.S32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
+}
+
+int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b);         // VQRSHL.S64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
+}
+
+uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b);         // VQRSHL.U8 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
+}
+
+uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b);         // VQRSHL.s16 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
+}
+
+uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b);         // VQRSHL.U32 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
+}
+
+uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b);         // VQRSHL.U64 q0,q0,q0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{
+    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
+}
+
+// *********************************************************************************
+// *****************************  Shifts by a constant *****************************
+// *********************************************************************************
+//**************** Vector shift right by constant*************************************
+//************************************************************************************
+
+int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b)         // VSHR.S8 q0,q0,#8
+{         //no 8 bit shift available, go to 16 bit trick
+    __m128i zero, mask0, a_sign, r, a_sign_mask;
+    _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
+    zero = _mm_setzero_si128();
+    mask0 = _mm_set1_epi16(mask0_16[b]);         //to mask the bits to be "spoiled"  by 16 bit shift
+    a_sign =  _mm_cmpgt_epi8 (zero, a);         //ff if a<0 or zero if a>0
+    r = _mm_srai_epi16 (a, b);
+    a_sign_mask =  _mm_and_si128 (mask0, a_sign);
+    r =  _mm_andnot_si128 (mask0, r);
+    return _mm_or_si128 (r, a_sign_mask);
+}
+
+int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VSHR.S16 q0,q0,#16
+#define vshrq_n_s16 _mm_srai_epi16
+
+int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VSHR.S32 q0,q0,#32
+#define vshrq_n_s32 _mm_srai_epi32
+
+int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VSHR.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
+{         //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD
+    __m128i c1, signmask,a0,  res64;
+    _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
+    c1 =  _mm_cmpeq_epi32(a,a);         //0xffffffffffffffff
+    signmask  =  _mm_slli_epi64 (c1, (64 - b));
+    a0 = _mm_or_si128(a, *(__m128i*)mask);         //get the first bit
+    #ifdef USE_SSE4
+        a0 = _mm_cmpeq_epi64 (a, a0);         //SSE4.1
+    #else
+        a0 = _mm_cmpeq_epi32 (a, a0);
+        a0 = _mm_shuffle_epi32 (a0, 1 | (1 << 2) | (3 << 4) | (3 << 6));         //copy the information from hi to low part of the 64 bit data
+    #endif
+    signmask = _mm_and_si128(a0, signmask);
+    res64 = _mm_srli_epi64 (a, b);
+    return _mm_or_si128(res64, signmask);
+}
+
+uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b)         // VSHR.U8 q0,q0,#8
+{         //no 8 bit shift available, need the special trick
+    __m128i mask0, r;
+    _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
+    mask0 = _mm_set1_epi16(mask10_16[b]);         //to mask the bits to be "spoiled"  by 16 bit shift
+    r = _mm_srli_epi16 ( a, b);
+    return _mm_and_si128 (r,  mask0);
+}
+
+uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VSHR.s16 q0,q0,#16
+#define vshrq_n_u16 _mm_srli_epi16
+
+uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VSHR.U32 q0,q0,#32
+#define vshrq_n_u32 _mm_srli_epi32
+
+uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VSHR.U64 q0,q0,#64
+#define vshrq_n_u64 _mm_srli_epi64
+
+//*************************** Vector shift left by constant *************************
+//*********************************************************************************
+
+int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
+#define vshlq_n_s8 vshlq_n_u8
+
+int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
+#define vshlq_n_s16 _mm_slli_epi16
+
+int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
+#define vshlq_n_s32 _mm_slli_epi32
+
+int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
+#define vshlq_n_s64 _mm_slli_epi64
+
+uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
+{         //no 8 bit shift available, need the special trick
+    __m128i mask0, r;
+    _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
+    mask0 = _mm_set1_epi16(mask10_16[b]);         //to mask the bits to be "spoiled"  by 16 bit shift
+    r = _mm_slli_epi16 ( a, b);
+    return _mm_and_si128 (r,  mask0);
+}
+
+uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
+#define vshlq_n_u16 vshlq_n_s16
+
+uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
+#define vshlq_n_u32 vshlq_n_s32
+
+uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
+#define vshlq_n_u64 vshlq_n_s64
+
+//************* Vector rounding shift right by constant ******************
+//*************************************************************************
+//No corresponding  x86 intrinsics exist, need to do some tricks
+
+int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VRSHR.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b)         // VRSHR.S8 q0,q0,#8
+{         //no 8 bit shift available, go to 16 bit trick
+    __m128i r, mask1, maskb;
+    _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080};         // 2^b-th bit set to 1
+    r = vshrq_n_s8 (a, b);
+    mask1 = _mm_set1_epi16(mask2b[b]);         // 2^b-th bit set to 1 for 16bit, need it for rounding
+    maskb = _mm_and_si128(a, mask1);         //get b or 0 for rounding
+    maskb =  _mm_srli_epi16 (maskb, b - 1);         // to add 1
+    return _mm_add_epi8(r, maskb);         //actual rounding
+}
+
+int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VRSHR.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b)         // VRSHR.S16 q0,q0,#16
+{
+    __m128i maskb, r;
+    maskb =  _mm_slli_epi16(a, (16 - b));         //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16(maskb, 15);         //1 or 0
+    r = _mm_srai_epi16 (a, b);
+    return _mm_add_epi16 (r, maskb);         //actual rounding
+}
+
+int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VRSHR.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b)         // VRSHR.S32 q0,q0,#32
+{
+    __m128i maskb,  r;
+    maskb = _mm_slli_epi32 (a, (32 - b));         //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi32 (maskb,31);         //1 or 0
+    r = _mm_srai_epi32(a, b);
+    return _mm_add_epi32 (r, maskb);         //actual rounding
+}
+
+int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VRSHR.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
+{         //solution may be not optimal compared with a serial one
+    __m128i maskb;
+    int64x2_t r;
+    maskb = _mm_slli_epi64 (a, (64 - b));         //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi64 (maskb,63);         //1 or 0
+    r = vshrq_n_s64(a, b);
+    return _mm_add_epi64 (r, maskb);         //actual rounding
+}
+
+uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VRSHR.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b)         // VRSHR.U8 q0,q0,#8
+{         //no 8 bit shift available, go to 16 bit trick
+    __m128i r, mask1, maskb;
+    _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080};         // 2^b-th bit set to 1
+    r = vshrq_n_u8 (a, b);
+    mask1 = _mm_set1_epi16(mask2b[b]);         // 2^b-th bit set to 1 for 16bit, need it for rounding
+    maskb = _mm_and_si128(a, mask1);         //get b or 0 for rounding
+    maskb =  _mm_srli_epi16 (maskb, b - 1);         // to add 1
+    return _mm_add_epi8(r, maskb);         //actual rounding
+}
+
+uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VRSHR.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b)         // VRSHR.S16 q0,q0,#16
+{
+    __m128i maskb, r;
+    maskb =  _mm_slli_epi16(a, (16 - b));         //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi16(maskb, 15);         //1 or 0
+    r = _mm_srli_epi16 (a, b);
+    return _mm_add_epi16 (r, maskb);         //actual rounding
+}
+
+uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VRSHR.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b)         // VRSHR.S32 q0,q0,#32
+{
+    __m128i maskb,  r;
+    maskb = _mm_slli_epi32 (a, (32 - b));         //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi32 (maskb,31);         //1 or 0
+    r = _mm_srli_epi32(a, b);
+    return _mm_add_epi32 (r, maskb);         //actual rounding
+}
+
+uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VRSHR.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
+{         //solution may be not optimal compared with a serial one
+    __m128i maskb,  r;
+    maskb = _mm_slli_epi64 (a, (64 - b));         //to get rounding (b-1)th bit
+    maskb = _mm_srli_epi64 (maskb,63);         //1 or 0
+    r = _mm_srli_epi64(a, b);
+    return _mm_add_epi64 (r, maskb);         //actual rounding
+}
+
+//************* Vector shift right by constant and accumulate *********
+//*********************************************************************
+
+int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c)         // VSRA.S8 q0,q0,#8
+{
+    int8x16_t shift;
+    shift = vshrq_n_s8(b, c);
+    return vaddq_s8(a, shift);
+}
+
+int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c)         // VSRA.S16 q0,q0,#16
+{
+    int16x8_t shift;
+    shift = vshrq_n_s16(b, c);
+    return vaddq_s16(a, shift);
+}
+
+int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c)         // VSRA.S32 q0,q0,#32
+{
+    int32x4_t shift;
+    shift = vshrq_n_s32(b, c);
+    return vaddq_s32(a, shift);
+}
+
+int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRA.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)         // VSRA.S64 q0,q0,#64
+{
+    int64x2_t shift;
+    shift = vshrq_n_s64(b, c);
+    return vaddq_s64( a, shift);
+}
+
+uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c)         // VSRA.U8 q0,q0,#8
+{
+    uint8x16_t shift;
+    shift = vshrq_n_u8(b, c);
+    return vaddq_u8(a, shift);
+}
+
+uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c)         // VSRA.s16 q0,q0,#16
+{
+    uint16x8_t shift;
+    shift = vshrq_n_u16(b, c);
+    return vaddq_u16(a,  shift);
+}
+
+uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c)         // VSRA.U32 q0,q0,#32
+{
+    uint32x4_t shift;
+    shift = vshrq_n_u32(b, c);
+    return vaddq_u32(a, shift);
+}
+
+uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRA.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)         // VSRA.U64 q0,q0,#64
+{
+    uint64x2_t shift;
+    shift = vshrq_n_u64(b, c);
+    return vaddq_u64(a, shift);
+}
+
+//************* Vector rounding shift right by constant and accumulate ****************************
+//************************************************************************************************
+
+int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VRSRA.S8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c)         // VRSRA.S8 q0,q0,#8
+{
+    int8x16_t shift;
+    shift = vrshrq_n_s8(b, c);
+    return vaddq_s8(a, shift);
+}
+
+int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VRSRA.S16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c)         // VRSRA.S16 q0,q0,#16
+{
+    int16x8_t shift;
+    shift = vrshrq_n_s16(b, c);
+    return vaddq_s16(a, shift);
+}
+
+int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VRSRA.S32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c)         // VRSRA.S32 q0,q0,#32
+{
+    int32x4_t shift;
+    shift = vrshrq_n_s32(b, c);
+    return vaddq_s32(a, shift);
+}
+
+int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VRSRA.S64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
+{
+    int64x2_t shift;
+    shift = vrshrq_n_s64(b, c);
+    return vaddq_s64(a, shift);
+}
+
+uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VRSRA.U8 q0,q0,#8
+_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c)         // VRSRA.U8 q0,q0,#8
+{
+    uint8x16_t shift;
+    shift = vrshrq_n_u8(b, c);
+    return vaddq_u8(a, shift);
+}
+
+uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VRSRA.s16 q0,q0,#16
+_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c)         // VRSRA.s16 q0,q0,#16
+{
+    uint16x8_t shift;
+    shift = vrshrq_n_u16(b, c);
+    return vaddq_u16(a,  shift);
+}
+
+uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VRSRA.U32 q0,q0,#32
+_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c)         // VRSRA.U32 q0,q0,#32
+{
+    uint32x4_t shift;
+    shift = vrshrq_n_u32(b, c);
+    return vaddq_u32(a, shift);
+}
+
+uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VRSRA.U64 q0,q0,#64
+_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
+{
+    uint64x2_t shift;
+    shift = vrshrq_n_u64(b, c);
+    return vaddq_u64(a, shift);
+}
+
+//**********************Vector saturating shift left by constant *****************************
+//********************************************************************************************
+//we don't check const ranges  assuming they are met
+
+int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHL.S8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b)         // VQSHL.S8 q0,q0,#0
+{         // go to 16 bit to get the auto saturation (in packs function)
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI8_EPI16 (a);         //SSE 4.1
+    r128_1 = _mm_slli_epi16 (a128, b);
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI8_EPI16 (a128);
+    r128_2 = _mm_slli_epi16 (a128, b);
+    return _mm_packs_epi16 (r128_1, r128_2);         //saturated s8
+}
+
+int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHL.S16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b)         // VQSHL.S16 q0,q0,#0
+{         // manual saturation solution looks LESS optimal than 32 bits conversion one
+      // go to 32 bit to get the auto saturation (in packs function)
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI16_EPI32 (a);         //SSE 4.1
+    r128_1 = _mm_slli_epi32 (a128, b);         //shift_res
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI16_EPI32 (a128);
+    r128_2 = _mm_slli_epi32 (a128, b);
+    return _mm_packs_epi32 (r128_1, r128_2);         //saturated s16
+}
+
+int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHL.S32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b)         // VQSHL.S32 q0,q0,#0
+{         // no 64 bit saturation option available, special tricks necessary
+    __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
+    c1 = _mm_cmpeq_epi32(a,a);         //0xff..ff
+    maskA = _mm_srli_epi32(c1, b + 1);         //mask for positive numbers (32-b+1) zeros and b-1 ones
+    saturation_mask = _mm_cmpgt_epi32 (a, maskA);         //0xff...ff if we need saturation, 0  otherwise
+    c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1);         //saturated to 0x7f..ff when needed and zeros if not
+    shift_res = _mm_slli_epi32 (a, b);
+    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
+    //result with positive numbers saturated
+    shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
+    //treat negative numbers
+    maskA = _mm_slli_epi32(c1, 31 - b);         //mask for negative numbers b-1 ones  and (32-b+1)  zeros
+    saturation_mask = _mm_cmpgt_epi32 (maskA,a);         //0xff...ff if we need saturation, 0  otherwise
+    c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31);         //saturated to 0x80..00 when needed and zeros if not
+    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
+    return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
+}
+
+int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHL.S64 q0,q0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{         // no effective SIMD solution here
+    _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
+    int64_t bmask;
+    int i;
+    bmask = ( int64_t)1 << (63 - b);         //positive
+    _mm_store_si128((__m128i*)atmp, a);
+    for (i = 0; i<2; i++) {
+        if (atmp[i] >= bmask) {
+            res[i] = ~(_SIGNBIT64);
+        } else {
+            res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
+        }
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VQSHL.U8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)         // VQSHL.U8 q0,q0,#0
+{         // go to 16 bit to get the auto saturation (in packs function)
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPU8_EPI16 (a);         //SSE 4.1
+    r128_1 = _mm_slli_epi16 (a128, b);
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPU8_EPI16 (a128);
+    r128_2 = _mm_slli_epi16 (a128, b);
+    return _mm_packus_epi16 (r128_1, r128_2);         //saturated u8
+}
+
+uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VQSHL.s16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b)         // VQSHL.s16 q0,q0,#0
+{         // manual saturation solution looks more optimal than 32 bits conversion one
+    __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
+    cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
+    c8000 = _mm_set1_epi16 (0x8000);
+//no unsigned shorts comparison in SSE, only signed available, so need the trick
+    a_signed = _mm_sub_epi16(a, c8000);         //go to signed
+    saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
+    shift_res = _mm_slli_epi16 (a, b);
+    return _mm_or_si128 (shift_res, saturation_mask);
+}
+
+uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VQSHL.U32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b)         // VQSHL.U32 q0,q0,#0
+{         // manual saturation solution, no 64 bit saturation option, the serial version may be faster
+    __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
+    cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
+    c80000000 = _mm_set1_epi32 (0x80000000);
+//no unsigned ints comparison in SSE, only signed available, so need the trick
+    a_signed = _mm_sub_epi32(a, c80000000);         //go to signed
+    saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
+    shift_res = _mm_slli_epi32 (a, b);
+    return _mm_or_si128 (shift_res, saturation_mask);
+}
+
+uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VQSHL.U64 q0,q0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
+{         // no effective SIMD solution here
+    _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
+    uint64_t bmask;
+    int i;
+    bmask = ( uint64_t)1 << (64 - b);
+    _mm_store_si128((__m128i*)atmp, a);
+    for (i = 0; i<2; i++) {
+        res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b;         //if b=0 we are fine with any a
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+//**************Vector signed->unsigned saturating shift left by constant *************
+//*************************************************************************************
+
+uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHLU.S8 q0,q0,#0
+_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b)         // VQSHLU.S8 q0,q0,#0
+{
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI8_EPI16 (a);         //SSE 4.1
+    r128_1 = _mm_slli_epi16 (a128, b);
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI8_EPI16 (a128);
+    r128_2 = _mm_slli_epi16 (a128, b);
+    return _mm_packus_epi16 (r128_1, r128_2);         //saturated u8
+}
+
+#if defined(USE_SSSE3)
+uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHLU.S16 q0,q0,#0
+_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b)         // VQSHLU.S16 q0,q0,#0
+{         // manual saturation solution looks LESS optimal than 32 bits conversion one
+    __m128i a128, r128_1, r128_2;
+    a128 = _MM_CVTEPI16_EPI32 (a);         //SSE 4.1
+    r128_1 = _mm_slli_epi32 (a128, b);         //shift_res
+    //swap hi and low part of a128 to process the remaining data
+    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
+    a128 = _MM_CVTEPI16_EPI32 (a128);
+    r128_2 = _mm_slli_epi32 (a128, b);
+    return _MM_PACKUS_EPI32 (r128_1, r128_2);         //saturated s16
+}
+#endif
+
+uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHLU.S32 q0,q0,#0
+_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b)         // VQSHLU.S32 q0,q0,#0
+{         //solution may be  not optimal compared with the serial one
+    __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
+    zero = _mm_setzero_si128();
+    maskA = _mm_cmpeq_epi32(a, a);
+    maskA = _mm_slli_epi32(maskA,(32 - b));         // b ones and (32-b)zeros
+    //saturate negative numbers to zero
+    maskGT0   = _mm_cmpgt_epi32 (a, zero);         // //0xffffffff if positive number and zero otherwise (negative numbers)
+    a0 = _mm_and_si128 (a,  maskGT0);         //negative are zeros now
+    //saturate positive to 0xffffffff
+    a_masked = _mm_and_si128 (a0, maskA);
+    a_masked = _mm_cmpgt_epi32 (a_masked, zero);         //0xffffffff if saturation necessary 0 otherwise
+    a_shift = _mm_slli_epi32 (a0, b);
+    return _mm_or_si128 (a_shift, a_masked);         //actual saturation
+}
+
+uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHLU.S64 q0,q0,#0
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
+{         // no effective SIMD solution here, serial execution looks faster
+    _NEON2SSE_ALIGN_16 int64_t atmp[2];
+    _NEON2SSE_ALIGN_16 uint64_t res[2];
+    uint64_t limit;
+    int i;
+    _mm_store_si128((__m128i*)atmp, a);
+    for (i = 0; i<2; i++) {
+        if (atmp[i]<=0) {
+            res[i] = 0;
+        } else {
+            limit = (uint64_t) 1 << (64 - b);
+            res[i] = ( ((uint64_t)atmp[i]) >= limit) ? res[i] = ~((uint64_t)0) : atmp[i] << b;
+        }
+    }
+    return _mm_load_si128((__m128i*)res);
+}
+
+//************** Vector narrowing  shift right by constant **************
+//**********************************************************************
+
+//************** Vector signed->unsigned narrowing saturating shift right by constant ********
+//*********************************************************************************************
+
+//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
+
+//***** Vector narrowing saturating shift right by constant ******
+//*****************************************************************
+
+//********* Vector rounding narrowing shift right by constant *************************
+//****************************************************************************************
+
+//************* Vector rounding narrowing saturating shift right by constant ************
+//****************************************************************************************
+
+//************** Vector widening shift left by constant ****************
+//************************************************************************
+
+//************************************************************************************
+//**************************** Shifts with insert ************************************
+//************************************************************************************
+//takes each element in a vector,  shifts them by an immediate value,
+//and inserts the results in the destination vector. Bits shifted out of the each element are lost.
+
+//**************** Vector shift right and insert ************************************
+//Actually the "c" left bits from "a" are the only bits remained from "a"  after the shift.
+//All other bits are taken from b shifted.
+
+int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
+_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c)         // VSRI.8 q0,q0,#8
+{
+    __m128i maskA, a_masked;
+    uint8x16_t b_shift;
+    _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};         //"a" bits mask, 0 bit not used
+    maskA = _mm_set1_epi8(maskLeft[c]);         // c ones and (8-c)zeros
+    a_masked = _mm_and_si128 (a, maskA);
+    b_shift = vshrq_n_u8( b, c);         // c zeros on the left in b due to logical shift
+    return _mm_or_si128 (a_masked, b_shift);         //combine (insert b into a)
+}
+
+int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
+_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c)         // VSRI.16 q0,q0,#16
+{         //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
+    uint16x8_t b_shift;
+    uint16x8_t a_c;
+    b_shift = vshrq_n_u16( b, c);         // c zeros on the left in b due to logical shift
+    a_c = vshrq_n_u16( a, (16 - c));
+    a_c  = _mm_slli_epi16(a_c, (16 - c));         //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift);         //combine (insert b into a)
+}
+
+int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
+_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c)         // VSRI.32 q0,q0,#32
+{         //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
+    uint32x4_t b_shift;
+    uint32x4_t a_c;
+    b_shift = vshrq_n_u32( b, c);         // c zeros on the left in b due to logical shift
+    a_c = vshrq_n_u32( a, (32 - c));
+    a_c  = _mm_slli_epi32(a_c, (32 - c));         //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift);         //combine (insert b into a)
+}
+
+int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
+_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
+{         //serial solution may be faster
+    uint64x2_t b_shift;
+    uint64x2_t a_c;
+    b_shift = _mm_srli_epi64(b, c);         // c zeros on the left in b due to logical shift
+    a_c = _mm_srli_epi64(a, (64 - c));
+    a_c  = _mm_slli_epi64(a_c, (64 - c));         //logical shift provides right "c" bits zeros in a
+    return _mm_or_si128 (a_c, b_shift);         //combine (insert b into a)
+}
+
+uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
+#define vsriq_n_u8 vsriq_n_s8
+
+uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
+#define vsriq_n_u16 vsriq_n_s16
+
+uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
+#define vsriq_n_u32 vsriq_n_s32
+
+uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
+#define vsriq_n_u64 vsriq_n_s64
+
+poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
+#define vsriq_n_p8 vsriq_n_u8
+
+poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
+#define vsriq_n_p16 vsriq_n_u16
+
+//***** Vector shift left and insert *********************************************
+//*********************************************************************************
+//Actually the "c" right bits from "a" are the only bits remained from "a"  after the shift.
+//All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
+
+int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
+_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c)         // VSLI.8 q0,q0,#0
+{
+    __m128i maskA, a_masked;
+    int8x16_t b_shift;
+    _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f};         //"a" bits mask
+    maskA = _mm_set1_epi8(maskRight[c]);         // (8-c)zeros and c ones
+    b_shift = vshlq_n_s8( b, c);
+    a_masked = _mm_and_si128 (a, maskA);
+    return _mm_or_si128 (b_shift, a_masked);         //combine (insert b into a)
+}
+
+int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
+_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c)         // VSLI.16 q0,q0,#0
+{         //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
+    int16x8_t b_shift;
+    int16x8_t a_c;
+    b_shift = vshlq_n_s16( b, c);
+    a_c = vshlq_n_s16( a, (16 - c));
+    a_c  = _mm_srli_epi16(a_c, (16 - c));
+    return _mm_or_si128 (b_shift, a_c);         //combine (insert b into a)
+}
+
+int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
+_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c)         // VSLI.32 q0,q0,#0
+{         //solution may be  not optimal compared with the serial one
+      //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
+    int32x4_t b_shift;
+    int32x4_t a_c;
+    b_shift = vshlq_n_s32( b, c);
+    a_c = vshlq_n_s32( a, (32 - c));
+    a_c  = _mm_srli_epi32(a_c, (32 - c));
+    return _mm_or_si128 (b_shift, a_c);         //combine (insert b into a)
+}
+
+int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
+_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c)         // VSLI.64 q0,q0,#0
+{         //solution may be  not optimal compared with the serial one
+      //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
+    int64x2_t b_shift;
+    int64x2_t a_c;
+    b_shift = vshlq_n_s64( b, c);
+    a_c = vshlq_n_s64( a, (64 - c));
+    a_c  = _mm_srli_epi64(a_c, (64 - c));
+    return _mm_or_si128 (b_shift, a_c);         //combine (insert b into a)
+}
+
+uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
+#define vsliq_n_u8 vsliq_n_s8
+
+uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
+#define vsliq_n_u16 vsliq_n_s16
+
+uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
+#define vsliq_n_u32 vsliq_n_s32
+
+uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
+#define vsliq_n_u64 vsliq_n_s64
+
+poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
+#define vsliq_n_p8 vsliq_n_u8
+
+poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
+#define vsliq_n_p16 vsliq_n_u16
+
+// ***********************************************************************************************
+// ****************** Loads and stores of a single vector ***************************************
+// ***********************************************************************************************
+//Performs loads and stores of a single vector of some type.
+//*******************************  Loads ********************************************************
+// ***********************************************************************************************
+//We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);.
+//also for SSE3  supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous.
+// it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access
+//If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead;
+#define LOAD_SI128(ptr) \
+        ( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr));
+
+uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
+#define vld1q_u8 LOAD_SI128
+
+uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
+#define vld1q_u16 LOAD_SI128
+
+uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
+#define vld1q_u32 LOAD_SI128
+
+uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+#define vld1q_u64 LOAD_SI128
+
+int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
+#define vld1q_s8 LOAD_SI128
+
+int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
+#define vld1q_s16 LOAD_SI128
+
+int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
+#define vld1q_s32 LOAD_SI128
+
+int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+#define vld1q_s64 LOAD_SI128
+
+float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr);         // VLD1.16 {d0, d1}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
+/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
+{__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
+__m128 f2;
+f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
+}*/
+
+float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
+{
+    if( (((unsigned long)(ptr)) & 15 ) == 0 )         //16 bits aligned
+        return _mm_load_ps(ptr);
+    else
+        return _mm_loadu_ps(ptr);
+}
+
+poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
+#define vld1q_p8  LOAD_SI128
+
+poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
+#define vld1q_p16 LOAD_SI128
+
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
+
+//***********************************************************************************************************
+//******* Lane load functions - insert the data at  vector's given position (lane) *************************
+//***********************************************************************************************************
+uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane);         // VLD1.8 {d0[0]}, [r0]
+#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
+
+uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
+#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
+
+uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
+#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
+
+uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane);         // VLD1.64 {d0}, [r0]
+#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane);         // _p;
+
+int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane);         // VLD1.8 {d0[0]}, [r0]
+#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
+
+int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
+#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
+
+int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
+#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
+
+//current IA SIMD doesn't support float16
+
+float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
+{         //we need to deal with  ptr  16bit NOT aligned case
+    __m128 p;
+    p = _mm_set1_ps(*(ptr));
+    return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
+}
+
+int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane);         // VLD1.64 {d0}, [r0]
+#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
+
+poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane);         // VLD1.8 {d0[0]}, [r0]
+#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
+
+poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
+#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
+
+//serial solution may be faster
+
+//current IA SIMD doesn't support float16
+
+// ****************** Load single value ( set all lanes of vector with same value from memory)**********************
+// ******************************************************************************************************************
+uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
+#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
+
+uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
+#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
+
+uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
+#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
+
+uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr);         // VLD1.64 {d0}, [r0]
+_NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
+{
+    _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
+    return LOAD_SI128(val);
+}
+
+int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
+#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
+
+int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
+#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
+
+int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
+#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
+
+int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr);         // VLD1.64 {d0}, [r0]
+#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
+
+float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr);         // VLD1.16 {d0[]}, [r0]
+//current IA SIMD doesn't support float16, need to go to 32 bits
+
+float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
+#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
+
+poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
+#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
+
+poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
+#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
+
+//current IA SIMD doesn't support float16
+
+//*************************************************************************************
+//********************************* Store **********************************************
+//*************************************************************************************
+// If ptr is 16bit aligned and you  need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
+//here we assume the case of  NOT 16bit aligned ptr possible. If it is aligned we could to use _mm_store_si128 like shown in the following macro
+#define STORE_SI128(ptr, val) \
+        (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
+
+void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val);         // VST1.8 {d0, d1}, [r0]
+#define vst1q_u8 STORE_SI128
+
+void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val);         // VST1.16 {d0, d1}, [r0]
+#define vst1q_u16 STORE_SI128
+
+void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val);         // VST1.32 {d0, d1}, [r0]
+#define vst1q_u32 STORE_SI128
+
+void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val);         // VST1.64 {d0, d1}, [r0]
+#define vst1q_u64 STORE_SI128
+
+void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val);         // VST1.8 {d0, d1}, [r0]
+#define vst1q_s8 STORE_SI128
+
+void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val);         // VST1.16 {d0, d1}, [r0]
+#define vst1q_s16 STORE_SI128
+
+void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val);         // VST1.32 {d0, d1}, [r0]
+#define vst1q_s32 STORE_SI128
+
+void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val);         // VST1.64 {d0, d1}, [r0]
+#define vst1q_s64 STORE_SI128
+
+void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val);         // VST1.16 {d0, d1}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently
+
+void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val);         // VST1.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
+{
+    if( ((unsigned long)(ptr) & 15)  == 0 )         //16 bits aligned
+        _mm_store_ps (ptr, val);
+    else
+        _mm_storeu_ps (ptr, val);
+}
+
+void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val);         // VST1.8 {d0, d1}, [r0]
+#define vst1q_p8  vst1q_u8
+
+void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val);         // VST1.16 {d0, d1}, [r0]
+#define vst1q_p16 vst1q_u16
+
+//current IA SIMD doesn't support float16
+
+//***********Store a lane of a vector into memory (extract given lane) *********************
+//******************************************************************************************
+void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane);         // VST1.8 {d0[0]}, [r0]
+#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
+
+void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
+#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
+
+void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane);         // VST1.32 {d0[0]}, [r0]
+#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
+
+void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane);         // VST1.64 {d0}, [r0]
+#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
+
+void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane);         // VST1.8 {d0[0]}, [r0]
+#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
+
+void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
+#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
+
+void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane);         // VST1.32 {d0[0]}, [r0]
+#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
+
+void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane);         // VST1.64 {d0}, [r0]
+#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
+
+void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane);         // VST1.32 {d0[0]}, [r0]
+_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
+{
+    int32_t ilane;
+    ilane = _MM_EXTRACT_PS(val,lane);
+    *(ptr) =  *((float*)&ilane);
+}
+
+void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane);         // VST1.8 {d0[0]}, [r0]
+#define vst1q_lane_p8   vst1q_lane_u8
+
+void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
+#define vst1q_lane_p16   vst1q_lane_s16
+
+//current IA SIMD doesn't support float16
+
+//***********************************************************************************************
+//**************** Loads and stores of an N-element structure **********************************
+//***********************************************************************************************
+//These intrinsics load or store an n-element structure. The array structures are defined in the beginning
+//We assume ptr is NOT aligned in general case, for more details see  "Loads and stores of a single vector functions"
+//****************** 2 elements load  *********************************************
+uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr)         // VLD2.8 {d0, d2}, [r0]
+{
+    uint8x16x2_t v;
+    v.val[0] = vld1q_u8(ptr);
+    v.val[1] = vld1q_u8((ptr + 16));
+    v = vuzpq_s8(v.val[0], v.val[1]);
+    return v;
+}
+
+#if defined(USE_SSSE3)
+uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr)         // VLD2.16 {d0, d2}, [r0]
+{
+    uint16x8x2_t v;
+    v.val[0] = vld1q_u16( ptr);
+    v.val[1] = vld1q_u16( (ptr + 8));
+    v = vuzpq_s16(v.val[0], v.val[1]);
+    return v;
+}
+#endif
+
+uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr)         // VLD2.32 {d0, d2}, [r0]
+{
+    uint32x4x2_t v;
+    v.val[0] = vld1q_u32 ( ptr);
+    v.val[1] = vld1q_u32 ( (ptr + 4));
+    v = vuzpq_s32(v.val[0], v.val[1]);
+    return v;
+}
+
+int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
+#define  vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
+
+#if defined(USE_SSSE3)
+int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
+#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
+#endif
+
+int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
+#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
+
+float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr);         // VLD2.16 {d0, d2}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr)         // VLD2.32 {d0, d2}, [r0]
+{
+    float32x4x2_t v;
+    v.val[0] =  vld1q_f32 (ptr);
+    v.val[1] =  vld1q_f32 ((ptr + 4));
+    v = vuzpq_f32(v.val[0], v.val[1]);
+    return v;
+}
+
+poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
+#define  vld2q_p8 vld2q_u8
+
+#if defined(USE_SSSE3)
+poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
+#define vld2q_p16 vld2q_u16
+#endif
+
+#if defined(USE_SSSE3)
+uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
+{
+    uint8x8x2_t v;
+    _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
+    __m128i ld128;
+    ld128 = vld1q_u8(ptr);         //merge two 64-bits in 128 bit
+    v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    return v;
+}
+#endif
+
+#if defined(USE_SSSE3)
+uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
+_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
+{
+    uint16x4x2_t v;
+    _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
+    __m128i ld128;
+    ld128 = vld1q_u16(ptr);         //merge two 64-bits in 128 bit
+    v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    return v;
+}
+#endif
+
+uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
+{
+    uint32x2x2_t v;
+    __m128i ld128;
+    ld128 = vld1q_u32(ptr);         //merge two 64-bits in 128 bit
+    v.val[0] = _mm_shuffle_epi32(ld128,  0 | (2 << 2) | (1 << 4) | (3 << 6));
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    return v;
+}
+
+uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
+{
+    uint64x1x2_t v;
+    v.val[0] = vld1q_u64(ptr);
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    return v;
+}
+
+#if defined(USE_SSSE3)
+int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
+#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
+
+int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
+#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
+#endif
+
+int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
+#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
+
+int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
+
+float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr);         // VLD2.16 {d0, d1}, [r0]
+
+float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
+{
+    float32x2x2_t v;
+    v.val[0] = vld1q_f32(ptr);
+    v.val[0] = _mm_shuffle_ps(v.val[0], v.val[0], _MM_SHUFFLE(3,1, 2, 0));
+    v.val[1] = _mm_movehl_ps(v.val[0],v.val[0]);
+    return v;
+}
+
+#if defined(USE_SSSE3)
+poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
+#define vld2_p8 vld2_u8
+
+poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
+#define vld2_p16 vld2_u16
+#endif
+
+//******************** Triplets ***************************************
+//*********************************************************************
+#if defined(USE_SSSE3)
+uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr)         // VLD3.8 {d0, d2, d4}, [r0]
+{  //a0,a1,a2,a3,...a7,a8,...a15,  b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
+   //a:0,3,6,9,12,15,b:2,5,8,11,14,  c:1,4,7,10,13
+   //a:1,4,7,10,13,  b:0,3,6,9,12,15,c:2,5,8,11,14,
+   //a:2,5,8,11,14,  b:1,4,7,10,13,  c:0,3,6,9,12,15
+    uint8x16x3_t v;
+    __m128i tmp0, tmp1,tmp2, tmp3;
+    _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
+    _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
+    _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
+
+    v.val[0] =  vld1q_u8 (ptr);        //a0,a1,a2,a3,...a7, ...a15
+    v.val[1] =  vld1q_u8 ((ptr + 16));	//b0,b1,b2,b3...b7, ...b15
+    v.val[2] =  vld1q_u8 ((ptr + 32));  //c0,c1,c2,c3,...c7,...c15
+
+    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11
+    tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
+    tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15
+
+    tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
+    tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
+    tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
+    tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
+    v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
+    v.val[0] = _mm_or_si128(v.val[0],tmp3) ;//a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
+
+    tmp3 = _mm_slli_si128(tmp0, 5);//0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
+    tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
+    v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
+    v.val[1] = _mm_slli_si128(v.val[1], 5);//0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
+    v.val[1] =	_mm_slli_si128(v.val[1],5);//0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
+    v.val[1] = _mm_srli_si128(v.val[1], 5);//a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
+    tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
+    tmp3 = _mm_slli_si128(tmp3,11);//0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
+
+    tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
+    tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
+    v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
+    v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
+    v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
+    tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
+    v.val[2] = _mm_or_si128(v.val[2],tmp0);//a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
+    return v;
+}
+#endif
+
+#if defined(USE_SSSE3)
+uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr)         // VLD3.16 {d0, d2, d4}, [r0]
+{  //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
+    uint16x8x3_t v;
+    __m128i tmp0, tmp1,tmp2, tmp3;
+    _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
+    _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
+    _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
+
+    v.val[0] =  vld1q_u16 (ptr);        //a0,a1,a2,a3,...a7,
+    v.val[1] =  vld1q_u16 ((ptr + 8));	//b0,b1,b2,b3...b7
+    v.val[2] =  vld1q_u16 ((ptr + 16));  //c0,c1,c2,c3,...c7
+
+    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
+    tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
+    tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
+
+    tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
+    tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
+    tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
+    tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
+    v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
+    v.val[0] = _mm_or_si128(v.val[0],tmp3);//a0,a3,a6,b1,b4,b7,c2,c5
+
+    tmp3 = _mm_slli_si128(tmp0, 4);//0,0,a0,a3,a6,a1,a4,a7
+    tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
+    v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
+    v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3);//a1,a4,a7,b2,b5,b0,b3,b6,
+    v.val[1] =	_mm_slli_si128(v.val[1],6);//0,0,0,a1,a4,a7,b2,b5,
+    v.val[1] = _mm_srli_si128(v.val[1], 6);//a1,a4,a7,b2,b5,0,0,0,
+    tmp3 = _mm_srli_si128(tmp2,4);  //c0,c3,c6, c1,c4,c7,0,0
+    tmp3 = _mm_slli_si128(tmp3,10);  //0,0,0,0,0,c0,c3,c6,
+    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
+
+    tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
+    tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
+    v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
+    v.val[2] = _mm_slli_si128(v.val[2],4);//0,0, b0,b3,b6,0,0,0
+    v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0, b0,b3,b6,c1,c4,c7,
+    tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
+    v.val[2] = _mm_or_si128(v.val[2],tmp0);//a2,a5,b0,b3,b6,c1,c4,c7,
+    return v;
+}
+#endif
+
+uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr)         // VLD3.32 {d0, d2, d4}, [r0]
+{//a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
+    uint32x4x3_t v;
+    __m128i tmp0, tmp1,tmp2, tmp3;
+    v.val[0] =  vld1q_u32 (ptr);        //a0,a1,a2,a3,
+    v.val[1] =  vld1q_u32 ((ptr + 4));	//b0,b1,b2,b3
+    v.val[2] =  vld1q_u32 ((ptr + 8));  //c0,c1,c2,c3,
+
+    tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
+    tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
+    tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
+
+    tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
+    v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
+    tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
+    v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
+    v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
+    v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
+    return v;
+}
+
+#if defined(USE_SSSE3)
+int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
+#define  vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
+
+int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+#define  vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
+#endif
+
+int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
+#define  vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
+
+float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr)         // VLD3.32 {d0, d2, d4}, [r0]
+{ //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
+    float32x4x3_t v;
+    __m128 tmp0, tmp1,tmp2, tmp3;
+    v.val[0] =  vld1q_f32 (ptr);        //a0,a1,a2,a3,
+    v.val[1] =  vld1q_f32 ((ptr + 4));	//b0,b1,b2,b3
+    v.val[2] =  vld1q_f32 ((ptr + 8));  //c0,c1,c2,c3,
+
+    tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
+    tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
+    tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
+    tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
+
+    v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
+    tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
+    v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
+    v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
+    v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
+    return v;
+}
+
+#if defined(USE_SSSE3)
+poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
+#define vld3q_p8 vld3q_u8
+
+poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
+#define vld3q_p16 vld3q_u16
+#endif
+
+#if defined(USE_SSSE3)
+uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr)         // VLD3.8 {d0, d1, d2}, [r0]
+{ //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
+    uint8x8x3_t v;
+    __m128i tmp0, tmp1;
+    _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
+    _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
+    v.val[0] =  vld1q_u8 (ptr);        //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
+
+    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
+    tmp1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
+    v.val[0] = _mm_slli_si128(tmp0,10);
+    v.val[0] = _mm_srli_si128(v.val[0],10);  //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
+    v.val[2] = _mm_slli_si128(tmp1,6);//0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
+    v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
+
+    v.val[1] = _mm_slli_si128(tmp0,5);  //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
+    v.val[1] = _mm_srli_si128(v.val[1],11);  //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
+    v.val[2] = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
+    v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
+    v.val[1] = _mm_or_si128(v.val[1],v.val[2]) ;//a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
+
+    tmp0 = _mm_srli_si128(tmp0,11);  //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
+    v.val[2] = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
+    v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c1,c4,c7,
+    v.val[2] = _mm_or_si128(tmp0, v.val[2]) ;//a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
+    return v;
+}
+#endif
+
+#if defined(USE_SSSE3)
+uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr)         // VLD3.16 {d0, d1, d2}, [r0]
+{ //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
+    uint16x4x3_t v;
+    __m128i tmp0, tmp1;
+    _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
+    v.val[0] =  vld1q_u16 (ptr);        //a0,a1,a2,a3,  b0,b1,b2,b3
+
+    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
+    tmp1 = _mm_shufflelo_epi16(v.val[2], 201); //11 00 10 01     : c1, c2, c0, c3,
+    v.val[0] = _mm_slli_si128(tmp0,10);
+    v.val[0] = _mm_srli_si128(v.val[0],10);  //a0, a3, b2, 0,0, 0,0,
+    v.val[2] = _mm_slli_si128(tmp1,14);//0,0,0,0,0,0,0,c1
+    v.val[2] = _mm_srli_si128(v.val[2],8);//0,0,0,c1,0,0,0,0
+    v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0, a3, b2, c1, x,x,x,x
+
+    v.val[1] = _mm_slli_si128(tmp0,4);  //0,0,0,0,0,a1, b0, b3
+    v.val[1] = _mm_srli_si128(v.val[1],10);  //a1, b0, b3, 0,0, 0,0,
+    v.val[2] = _mm_srli_si128(tmp1,2);//c2, 0,0,0,0,0,0,0,
+    v.val[2] = _mm_slli_si128(v.val[2],6);//0,0,0,c2,0,0,0,0
+    v.val[1] = _mm_or_si128(v.val[1],v.val[2]); //a1, b0, b3, c2, x,x,x,x
+
+    tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
+    tmp1 = _mm_srli_si128(tmp1,4);
+    tmp1 = _mm_slli_si128(tmp1,4);  //0,0,c0, c3,
+    v.val[2] = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
+    return v;
+}
+#endif
+
+uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr)         // VLD3.32 {d0, d1, d2}, [r0]
+{ //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
+    uint32x2x3_t v;
+    v.val[0] =  vld1q_u32 (ptr);        //a0,a1,  b0,b1,
+
+    v.val[0] = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6));  //a0,b1, a1, b0
+    v.val[2] =  _mm_slli_si128(v.val[2], 8);  //x, x,c0,c1,
+    v.val[1] =  _mm_unpackhi_epi32(v.val[0],v.val[2]); //a1,c0, b0, c1
+    v.val[2] =  _mm_srli_si128(v.val[1], 8);  //b0, c1, x, x,
+    return v;
+}
+uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2}, [r0]
+{
+    uint64x1x3_t v;
+    v.val[0] = vld1q_u64 (ptr);
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    return v;
+}
+
+#if defined(USE_SSSE3)
+int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
+#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
+
+int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
+#endif
+
+int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
+#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
+
+int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
+
+float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
+{ //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
+    float32x2x3_t v;
+    v.val[0] =  vld1q_f32 (ptr);        //a0,a1,  b0,b1,
+
+    v.val[0] = _mm_shuffle_ps(v.val[0],v.val[0], _MM_SHUFFLE(2,1, 3, 0));  //a0,b1, a1, b0
+    v.val[2] =  _mm_movelh_ps(v.val[2], v.val[2]);  //x, x,c0,c1,
+    v.val[1] =  _mm_unpackhi_ps(v.val[0],v.val[2]); //a1,c0, b0, c1
+    v.val[2] =  _mm_movehl_ps(v.val[1], v.val[1]);  //b0, c1, x, x,
+    return v;
+}
+
+#if defined(USE_SSSE3)
+poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
+#define vld3_p8 vld3_u8
+
+poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
+#define vld3_p16 vld3_u16
+#endif
+
+//***************  Quadruples load ********************************
+//*****************************************************************
+uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr)         // VLD4.8 {d0, d2, d4, d6}, [r0]
+{
+    uint8x16x4_t v;
+   __m128i tmp3, tmp2, tmp1, tmp0;
+
+    v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
+    v.val[1] = vld1q_u8 ( (ptr + 16));//b0, b1,b2,...b7.... b15
+    v.val[2] = vld1q_u8 ( (ptr + 32));//c0, c1,c2,...c7....c15
+    v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
+
+    tmp0= _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
+    tmp1= _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
+    tmp2= _mm_unpackhi_epi8(v.val[0],v.val[1]);//a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
+    tmp3= _mm_unpackhi_epi8(v.val[2],v.val[3]);//c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
+
+    v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8,  a1,a9, b1,b9, ....a3,a11, b3,b11
+    v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
+    v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
+    v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
+
+    tmp0 =  _mm_unpacklo_epi32(v.val[0] , v.val[2] ); ///a0,a8, b0,b8, c0,c8,  d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
+    tmp1 =  _mm_unpackhi_epi32(v.val[0] , v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
+    tmp2 =  _mm_unpacklo_epi32(v.val[1] , v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
+    tmp3 =  _mm_unpackhi_epi32(v.val[1] , v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
+
+    v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
+    v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
+    v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
+    v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
+    return v;
+}
+
+uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr)         // VLD4.16 {d0, d2, d4, d6}, [r0]
+{
+    uint16x8x4_t v;
+    __m128i tmp3, tmp2, tmp1, tmp0;
+    tmp0  =  vld1q_u16 (ptr);       //a0,a1,a2,...a7
+    tmp1  =  vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
+    tmp2  =  vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
+    tmp3  =  vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
+    v.val[0]= _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
+    v.val[1]= _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
+    v.val[2]= _mm_unpackhi_epi16(tmp0,tmp1);//a4,b4, a5,b5, a6,b6, a7,b7
+    v.val[3]= _mm_unpackhi_epi16(tmp2,tmp3);//c4,d4, c5,d5, c6,d6, c7,d7
+    tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]);//a0,a4, b0,b4, a1,a5, b1,b5
+    tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
+    tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
+    tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]);//c2,c6, d2,d6, c3,c7, d3,d7
+    v.val[0] =  _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
+    v.val[1] =  _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
+    v.val[2] =  _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
+    v.val[3] =  _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
+    return v;
+}
+
+uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr)         // VLD4.32 {d0, d2, d4, d6}, [r0]
+{
+    uint32x4x4_t v;
+    __m128i tmp3, tmp2, tmp1, tmp0;
+    v.val[0] =  vld1q_u32 (ptr);
+    v.val[1] =  vld1q_u32 ((ptr + 4));
+    v.val[2] =  vld1q_u32 ((ptr + 8));
+    v.val[3] =  vld1q_u32 ((ptr + 12));
+    tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
+    tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
+    tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
+    tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
+    v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
+    v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
+    v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
+    v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
+    return v;
+}
+
+int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
+#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
+
+int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+#define  vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
+
+int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
+#define  vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
+
+float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr)         // VLD4.32 {d0, d2, d4, d6}, [r0]
+{
+    float32x4x4_t v;
+    __m128 tmp3, tmp2, tmp1, tmp0;
+
+    v.val[0] =  vld1q_f32 ((float*) ptr);
+    v.val[1] =  vld1q_f32 ((float*) (ptr + 4));
+    v.val[2] =  vld1q_f32 ((float*) (ptr + 8));
+    v.val[3] =  vld1q_f32 ((float*) (ptr + 12));
+    tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
+    tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
+    tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
+    tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
+    v.val[0] = _mm_movelh_ps(tmp0, tmp2);
+    v.val[1] = _mm_movehl_ps(tmp2, tmp0);
+    v.val[2] = _mm_movelh_ps(tmp1, tmp3);
+    v.val[3] = _mm_movehl_ps(tmp3, tmp1);
+    return v;
+}
+
+poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
+#define vld4q_p8 vld4q_u8
+
+poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
+#define vld4q_p16 vld4q_s16
+
+#if defined(USE_SSSE3)
+uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr)         // VLD4.8 {d0, d1, d2, d3}, [r0]
+{
+    uint8x8x4_t v;
+    __m128i sh0, sh1;
+    _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
+
+    v.val[0] = vld1q_u8(( ptr));         //load first 64-bits in val[0] and val[1]
+    v.val[1] = vld1q_u8(( ptr + 16));         //load third and forth 64-bits in val[2], val[3]
+
+    sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_8);
+    sh1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask4_8);
+    v.val[0] = _mm_unpacklo_epi32(sh0,sh1);         //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
+    v.val[2] = _mm_unpackhi_epi32(sh0,sh1);         //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
+    v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32);
+    v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32);
+
+    return v;
+}
+#endif
+
+#if defined(USE_SSSE3)
+uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr)         // VLD4.16 {d0, d1, d2, d3}, [r0]
+{
+    uint16x4x4_t v;
+    __m128i sh0, sh1;
+    _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};         //0, 4, 1, 5, 2, 6, 3, 7
+    v.val[0] = vld1q_u16 ( (ptr));         //load first 64-bits in val[0] and val[1]
+    v.val[2] = vld1q_u16 ( (ptr + 8));         //load third and forth 64-bits in val[2], val[3]
+    sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_16);
+    sh1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask4_16);
+    v.val[0] = _mm_unpacklo_epi32(sh0,sh1);         //0,4,8,12, 1,5,9,13
+    v.val[2] = _mm_unpackhi_epi32(sh0,sh1);         //2,6,10,14, 3,7,11,15
+    v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32);
+    v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32);
+    return v;
+}
+#endif
+
+uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
+{   //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
+     uint32x4x4_t v, res;
+    v.val[0] =  vld1q_u32 (ptr);        //a0,a1,  b0,b1,
+    v.val[2] =  vld1q_u32 ((ptr + 4));  //c0,c1, d0,d1
+    res.val[0] = _mm_unpacklo_epi32(v.val[0],v.val[2]);  //a0, c0, a1,c1,
+    res.val[2] = _mm_unpackhi_epi32(v.val[0],v.val[2]);  //b0,d0, b1, d1
+    res.val[1] = _mm_shuffle_epi32(res.val[0],_SWAP_HI_LOW32); //a1,c1, a0, c0,
+    res.val[3] = _mm_shuffle_epi32(res.val[2],_SWAP_HI_LOW32);//b1, d1,b0,d0,
+    return res;
+}
+
+uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2, d3}, [r0]
+{
+    uint64x1x4_t v;
+    v.val[0] = vld1q_u64( (ptr));         //load first 64-bits in val[0] and val[1]
+    v.val[2] = vld1q_u64( (ptr + 2));         //load third and forth 64-bits in val[2], val[3]
+    return v;
+}
+
+#if defined(USE_SSSE3)
+int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
+#define  vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
+
+int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
+#endif
+
+int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
+#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
+
+int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
+
+float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr)         // VLD4.32 {d0, d1, d2, d3}, [r0]
+{         //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
+    float32x2x4_t v, res;
+    v.val[0] =  vld1q_f32 ((float*) ptr);         //a0,a1,  b0,b1,
+    v.val[2] =  vld1q_f32 ((float*) (ptr + 4));         //c0,c1, d0,d1
+    res.val[0] = _mm_unpacklo_ps(v.val[0],v.val[2]);         //a0, c0, a1,c1,
+    res.val[2] = _mm_unpackhi_ps(v.val[0],v.val[2]);         //b0,d0, b1, d1
+    res.val[1] = _mm_movehl_ps(res.val[0],res.val[0]);          // a1,c1, a0, c0,
+    res.val[3] = _mm_movehl_ps(res.val[2],res.val[2]);         // b1, d1, b0,d0,
+    return res;
+}
+
+#if defined(USE_SSSE3)
+poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
+#define vld4_p8 vld4_u8
+
+poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
+#define vld4_p16 vld4_u16
+#endif
+
+//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
+//*******************************************************************************************************************
+uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr)         // VLD2.8 {d0[], d1[]}, [r0]
+{
+    uint8x8x2_t v;
+    v.val[0] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
+    v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
+    v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
+    v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    return v;
+}
+
+uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr)         // VLD2.16 {d0[], d1[]}, [r0]
+{
+    uint16x4x2_t v;
+    v.val[1] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
+    v.val[0] = _mm_shufflelo_epi16(v.val[1], 0); //00 00 00 00 (all 0)
+    v.val[1] = _mm_shufflelo_epi16(v.val[1], 85);//01 01 01 01 (all 1)
+    return v;
+}
+
+uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr)         // VLD2.32 {d0[], d1[]}, [r0]
+{
+    uint32x2x2_t v;
+    v.val[0] = LOAD_SI128(ptr); //0,1,x,x
+    v.val[0] = _mm_shuffle_epi32(v.val[0],   0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
+    v.val[1] = _mm_srli_si128(v.val[0], 8); //1,1,0x0,0x0
+    return v;
+}
+
+uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+#define vld2_dup_u64 vld2_u64
+
+int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
+#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
+
+int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
+
+int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
+#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
+
+int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
+#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
+
+float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
+_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr)         // VLD2.32 {d0[], d1[]}, [r0]
+{
+    float32x2x2_t v;
+    v.val[0] = vld1q_f32(ptr);  //0,1,x,x
+    v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x
+    v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,x,x
+    return v;
+}
+
+poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
+#define vld2_dup_p8 vld2_dup_u8
+
+poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
+#define vld2_dup_p16 vld2_dup_s16
+
+//************* Duplicate (or propagate)triplets: *******************
+//********************************************************************
+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
+uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr)         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+{
+    uint8x8x3_t v;
+    v.val[0] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
+    v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
+    v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
+    v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+    v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    return v;
+}
+
+uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr)         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+{
+    uint16x4x3_t v;
+    v.val[2] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
+    v.val[0] = _mm_shufflelo_epi16(v.val[2], 0); //00 00 00 00 (all 0)
+    v.val[1] = _mm_shufflelo_epi16(v.val[2], 85);//01 01 01 01 (all 1)
+    v.val[2] = _mm_shufflelo_epi16(v.val[2], 170);//10 10 10 10 (all 2)
+    return v;
+}
+
+uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr)         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+{
+    uint32x2x3_t v;
+    v.val[2] = LOAD_SI128(ptr); //0,1,2,x
+    v.val[0] = _mm_shuffle_epi32(v.val[2],   0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
+    v.val[1] = _mm_shuffle_epi32(v.val[2],   1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
+    v.val[2] = _mm_srli_si128(v.val[0], 8); //2,2,0x0,0x0
+    return v;
+}
+
+uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2}, [r0]
+{
+    uint64x1x3_t v;
+    v.val[0] = LOAD_SI128(ptr);//0,1,
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0
+    v.val[2] = LOAD_SI128((ptr + 2));  //2,x
+    return v;
+}
+
+int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
+
+int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
+
+int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
+
+int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
+#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
+
+float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr)         // VLD3.32 {d0[], d1[], d2[]}, [r0]
+{
+    float32x2x3_t v;
+    v.val[0] = vld1q_f32(ptr);  //0,1,2,x
+    v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x
+    v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2
+    v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0,
+    return v;
+}
+
+poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_p8 vld3_dup_u8
+
+poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
+#define vld3_dup_p16 vld3_dup_s16
+
+//************* Duplicate (or propagate) quadruples: *******************
+//***********************************************************************
+//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes  and  ptr[3] to all val[3] lanes
+uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr)         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+{
+    uint8x8x4_t v;
+    v.val[0] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
+    v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
+    v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
+    v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
+    v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
+    v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32);
+    return v;
+}
+
+uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr)         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+{
+    uint16x4x4_t v;
+    v.val[3] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
+    v.val[0] = _mm_shufflelo_epi16(v.val[3], 0); //00 00 00 00 (all 0)
+    v.val[1] = _mm_shufflelo_epi16(v.val[3], 85);//01 01 01 01 (all 1)
+    v.val[2] = _mm_shufflelo_epi16(v.val[3], 170);//10 10 10 10 (all 2)
+    v.val[3] = _mm_shufflelo_epi16(v.val[3], 255);//11 11 11 11 (all 3)
+    return v;
+}
+
+uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr)         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+{
+    uint32x2x4_t v;
+    v.val[3] = LOAD_SI128(ptr) ; //0,1,2,3
+    v.val[0] = _mm_shuffle_epi32(v.val[3],   0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
+    v.val[1] = _mm_shuffle_epi32(v.val[3],   1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
+    v.val[2] = _mm_shuffle_epi32(v.val[3],   2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
+    v.val[3] = _mm_shuffle_epi32(v.val[3],   3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2
+    return v;
+}
+
+uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2, d3}, [r0]
+{
+    uint64x1x4_t v;
+    v.val[0] = LOAD_SI128(ptr); //0,1,
+    v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0
+    v.val[2] = LOAD_SI128((ptr + 2)); //2,3
+    v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32); //3,2
+    return v;
+}
+
+int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
+
+int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
+
+int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
+
+int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
+#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
+
+float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr)         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
+{
+    float32x2x4_t v;
+    v.val[0] = vld1q_f32(ptr);  //0,1,2,3
+    v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,3,3
+    v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2
+    v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0,
+    v.val[3] = _mm_movehl_ps(v.val[1], v.val[1]); //3,3,1,1,
+    return v;
+}
+
+poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_p8 vld4_dup_u8
+
+poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
+#define vld4_dup_p16 vld4_dup_u16
+
+//**********************************************************************************
+//*******************Lane loads for  an N-element structures ***********************
+//**********************************************************************************
+//********************** Lane pairs  ************************************************
+//does vld1_lane_xx ptr[0] to src->val[0] at lane positon and ptr[1] to src->val[1] at lane positon
+//we assume  src is 16 bit aligned
+
+//!!!!!! Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error
+//to fix it the all functions below work with  xxxxxx_2t pointers and the corresponding original functions are redefined
+
+//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane)         // VLD2.16 {d0[0], d2[0]}, [r0]
+{
+    uint16x8x2_t v;
+    v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
+    v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
+    return v;
+}
+#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
+
+//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane)         // VLD2.32 {d0[0], d2[0]}, [r0]
+{
+    uint32x4x2_t v;
+    v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
+    return v;
+}
+#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
+
+//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
+{
+    int16x8x2_t v;
+    v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
+    v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
+    return v;
+}
+#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
+
+//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
+{
+    int32x4x2_t v;
+    v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
+    return v;
+}
+#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
+
+//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+//float32x4x2_t vld2q_lane_f32(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane)         // VLD2.32 {d0[0], d2[0]}, [r0]
+{
+    float32x4x2_t v;
+    v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
+    v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
+    return v;
+}
+#define vld2q_lane_f32(ptr, src, lane) vld2q_lane_f32_ptr(ptr, &src, lane)
+
+//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
+#define vld2q_lane_p16 vld2q_lane_u16
+
+//uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane)         // VLD2.8 {d0[0], d1[0]}, [r0]
+{
+    uint8x8x2_t val;
+    val.val[0] = _MM_INSERT_EPI8 (src->val[0], (int)ptr[0], lane);
+    val.val[1] = _MM_INSERT_EPI8 (src->val[1], (int)ptr[1], lane);
+    return val;
+}
+#define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane)
+
+//uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
+#define vld2_lane_u16 vld2q_lane_u16
+
+//uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
+#define vld2_lane_u32 vld2q_lane_u32
+
+//int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
+int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane);         // VLD2.8 {d0[0], d1[0]}, [r0]
+#define vld2_lane_s8(ptr, src, lane)  vld2_lane_u8(( uint8_t*) ptr, src, lane)
+
+//int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
+int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
+#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
+
+//int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
+int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
+#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
+
+//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
+#define vld2_lane_f32 vld2q_lane_f32
+
+//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
+poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane);         // VLD2.8 {d0[0], d1[0]}, [r0]
+#define vld2_lane_p8 vld2_lane_u8
+
+//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
+poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
+#define vld2_lane_p16 vld2_lane_u16
+
+//*********** Lane triplets **********************
+//*************************************************
+//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at lane positon
+//we assume src is 16 bit aligned
+
+//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane)         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+{
+    uint16x8x3_t v;
+    v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
+    return v;
+}
+#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
+
+//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane)         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+{
+    uint32x4x3_t v;
+    v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
+    return v;
+}
+#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
+
+//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane)         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+{
+    int16x8x3_t v;
+    v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
+    return v;
+}
+#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
+
+//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane)         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+{
+    int32x4x3_t v;
+    v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
+    return v;
+}
+#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
+
+float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+//current IA SIMD doesn't support float16
+#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
+
+//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane)         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
+{
+    float32x4x3_t v;
+    v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
+    v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
+    v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
+    return v;
+}
+#define vld3q_lane_f32(ptr, src, lane) vld3q_lane_f32_ptr(ptr, &src, lane)
+
+poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
+#define vld3q_lane_p16 vld3q_lane_u16
+
+//uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane)         // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+{
+    uint8x8x3_t v;
+    v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane);
+    return v;
+}
+#define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)
+
+//uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t   const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane)         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+{
+    uint16x4x3_t v;
+    v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI16 (src->val[1], ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane);
+    return v;
+}
+#define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)
+
+//uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane)         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+{         //need to merge into 128 bit anyway
+    uint32x2x3_t v;
+    v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane);
+    return v;
+}
+#define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
+
+int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane);         // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_s8(ptr, src, lane)  vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
+
+int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_s16(ptr, src, lane)  vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
+
+int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane);         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_s32(ptr, src, lane)  vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
+
+float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+//float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane)         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
+{
+    float32x2x3_t v;
+    v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
+    return v;
+}
+#define vld3_lane_f32(ptr, src, lane) vld3_lane_f32_ptr(ptr, &src, lane)
+
+//poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_p8 vld3_lane_u8
+
+//poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
+#define vld3_lane_p16 vld3_lane_u16
+
+//******************* Lane Quadruples  load ***************************
+//*********************************************************************
+//does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at lane positon
+//we assume src is 16 bit aligned
+
+//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
+{
+    uint16x8x4_t v;
+    v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
+    v.val[3] = _MM_INSERT_EPI16 ( src->val[3],  ptr[3], lane);
+    return v;
+}
+#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
+
+//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
+{
+    uint32x4x4_t v;
+    v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
+    v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
+    v.val[3] = _MM_INSERT_EPI32 ( src->val[3],  ptr[3], lane);
+    return v;
+}
+#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
+
+//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
+
+//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+#define vld4q_lane_s32(ptr, src, lane)  vld4q_lane_u32(( uint32_t*) ptr, src, lane)
+
+//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
+{
+    float32x4x4_t v;
+    v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
+    v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
+    v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
+    v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
+    return v;
+}
+#define vld4q_lane_f32(ptr, src, lane) vld4q_lane_f32_ptr(ptr, &src, lane)
+
+//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+#define vld4q_lane_p16 vld4q_lane_u16
+
+//uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane)
+{
+    uint8x8x4_t v;
+    v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane );
+    v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane );
+    v.val[3] = _MM_INSERT_EPI8 (src->val[3], ptr[3], lane );
+    return v;
+}
+#define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane)
+
+//uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane)
+{
+    uint16x4x4_t v;
+    v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI16 (src->val[1], ptr[1], lane );
+    v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane );
+    v.val[3] = _MM_INSERT_EPI16 (src->val[3], ptr[3], lane );
+    return v;
+}
+#define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)
+
+//uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
+{
+    uint32x2x4_t v;
+    v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
+    v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane );
+    v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane );
+    v.val[3] = _MM_INSERT_EPI32 (src->val[3], ptr[3], lane );
+    return v;
+}
+#define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)
+
+//int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);
+#define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
+
+//int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);
+#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
+
+//int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);
+#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
+
+//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
+//current IA SIMD doesn't support float16
+
+//float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
+{         //serial solution may be faster
+    float32x2x4_t v;
+    return v;
+}
+#define vld4_lane_f32(ptr, src, lane) vld4_lane_f32_ptr(ptr, &src, lane)
+
+//poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
+#define vld4_lane_p8 vld4_lane_u8
+
+//poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);
+#define vld4_lane_p16 vld4_lane_u16
+
+//******************* Store duplets *********************************************
+//********************************************************************************
+//here we assume the ptr is 16bit aligned. If not we need to use _mm_storeu_si128 like shown in vst1q_u8 function
+//If necessary you need to modify all store functions accordingly. See more comments to "Store single" functions
+//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
+_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
+{
+    uint8x16x2_t v;
+    v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
+    v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
+    vst1q_u8 (ptr, v.val[0]);
+    vst1q_u8 ((ptr + 16),  v.val[1]);
+}
+#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
+
+//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
+_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
+{
+    uint16x8x2_t v;
+    v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
+    v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
+    vst1q_u16 (ptr, v.val[0]);
+    vst1q_u16 ((ptr + 8),  v.val[1]);
+}
+#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
+
+//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
+{
+    uint32x4x2_t v;
+    v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
+    v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
+    vst1q_u32 (ptr, v.val[0]);
+    vst1q_u32 ((ptr + 4),  v.val[1]);
+}
+#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
+
+//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
+void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
+#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
+
+//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
+void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
+#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
+
+//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
+void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
+#define vst2q_s32(ptr, val)  vst2q_u32((uint32_t*)(ptr), val)
+
+//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
+void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
+// IA32 SIMD doesn't work with 16bit floats currently
+
+//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
+_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
+{
+    float32x4x2_t v;
+    v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
+    v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
+    vst1q_f32 (ptr, v.val[0]);
+    vst1q_f32 ((ptr + 4),  v.val[1]);
+}
+#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
+
+//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
+void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
+#define vst2q_p8 vst2q_u8
+
+//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
+void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
+#define vst2q_p16 vst2q_u16
+
+//void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val)
+{
+    uint8x8x2_t v;
+    v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
+    vst1q_u8 (ptr, v.val[0]);
+}
+#define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val)
+
+//void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val)
+{
+    uint16x4x2_t v;
+    v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
+    vst1q_u16 (ptr, v.val[0]);
+}
+#define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val)
+
+//void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val)
+{
+    uint32x2x2_t v;
+    v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
+    vst1q_u32 (ptr, v.val[0]);
+}
+#define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val)
+
+//void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
+void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);
+_NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val)
+{
+    uint64x1x2_t v;
+    v.val[0] = _mm_unpacklo_epi64(val->val[0], val->val[1]);
+    vst1q_u64(ptr, v.val[0]);
+}
+#define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val)
+
+//void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
+#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
+
+//void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
+
+//void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
+#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
+
+//void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
+#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
+
+//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
+//current IA SIMD doesn't support float16
+
+void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
+_NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val)
+{
+    float32x4x2_t v;
+    v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
+    vst1q_f32 (ptr, v.val[0]);
+}
+#define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val)
+
+//void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
+#define vst2_p8 vst2_u8
+
+//void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
+#define vst2_p16 vst2_u16
+
+//******************** Triplets store  *****************************************
+//******************************************************************************
+//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
+#if defined(USE_SSSE3)
+_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
+{
+    uint8x16x3_t v;
+    __m128i v0,v1,v2, cff, bldmask;
+    _NEON2SSE_ALIGN_16 uint8_t mask0[16]   = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
+    _NEON2SSE_ALIGN_16 uint8_t mask1[16]   = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
+    _NEON2SSE_ALIGN_16 uint8_t mask2[16] =    {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
+    _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
+    _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
+    _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
+
+    v0 =  _mm_unpacklo_epi8(val->val[0], val->val[1]);         //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
+    v2 =  _mm_unpackhi_epi8(val->val[0], val->val[1]);         //24,25,  27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
+    v1 =  _mm_alignr_epi8(v2, v0, 11);         //12,13, 15,16, 18,19, 21,22, 24,25,  27,28, 30,31, 33,34
+    v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0);         //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo);         //make plugs for the v.val[2] data embedding
+    cff = _mm_cmpeq_epi8(v0, v0);         //all ff
+    bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
+    v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
+    vst1q_u8(ptr,   v.val[0]);
+    v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1);         //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med);         //make plugs for the v.val[2] data embedding
+    bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
+    v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
+    vst1q_u8((ptr + 16),  v.val[1]);
+    v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2);         //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi);         //make plugs for the v.val[2] data embedding
+    bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
+    v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
+    vst1q_u8((ptr + 32),  v.val[2]);
+}
+#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
+#endif
+
+#if defined(USE_SSSE3)
+//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
+{
+    uint16x8x3_t v;
+    __m128i v0,v1,v2, cff, bldmask;
+    _NEON2SSE_ALIGN_16 uint8_t mask0[16]   = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
+    _NEON2SSE_ALIGN_16 uint8_t mask1[16]   = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
+    _NEON2SSE_ALIGN_16 uint8_t mask2[16] =    {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
+    _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
+    _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
+    _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
+
+    v0 =  _mm_unpacklo_epi16(val->val[0], val->val[1]);         //0,1, 3,4, 6,7, 9,10
+    v2 =  _mm_unpackhi_epi16(val->val[0], val->val[1]);         //12,13, 15,16, 18,19, 21,22,
+    v1 =  _mm_alignr_epi8(v2, v0, 12);         //9,10, 12,13, 15,16, 18,19
+    v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0);         //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo);         //make plugs for the v.val[2] data embedding
+    cff = _mm_cmpeq_epi16(v0, v0);         //all ff
+    bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
+    v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
+    vst1q_u16(ptr,      v.val[0]);
+    v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1);         //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med);         //make plugs for the v.val[2] data embedding
+    bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
+    v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
+    vst1q_u16((ptr + 8),  v.val[1]);
+    v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2);         //make holes for the v.val[2] data embedding
+    v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi);         //make plugs for the v.val[2] data embedding
+    bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
+    v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
+    vst1q_u16((ptr + 16), v.val[2]);
+}
+#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
+#endif
+
+//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
+{   //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
+    uint32x4x3_t v;
+    __m128i tmp0, tmp1,tmp2;
+    tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
+    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
+    tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
+    v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
+    v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
+    v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
+    tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
+    v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
+
+    vst1q_u32(ptr,      v.val[0]);
+    vst1q_u32((ptr + 4),  v.val[1]);
+    vst1q_u32((ptr + 8),  v.val[2]);
+}
+#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
+
+#if defined(USE_SSSE3)
+//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
+void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
+#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
+
+//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
+void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
+#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
+#endif
+
+//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
+void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
+#define vst3q_s32(ptr, val)  vst3q_u32((uint32_t*)(ptr), val)
+
+//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
+void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
+// IA32 SIMD doesn't work with 16bit floats currently
+
+//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
+_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
+{
+     float32x4x3_t  v;
+    __m128 tmp0, tmp1,tmp2;
+    tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
+    tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
+    tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
+    v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
+    v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
+    v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
+    tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
+    v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
+
+    vst1q_f32( ptr,    v.val[0]);
+    vst1q_f32( (ptr + 4),  v.val[1]);
+    vst1q_f32( (ptr + 8),  v.val[2]);
+}
+#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
+
+#if defined(USE_SSSE3)
+//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
+void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
+#define vst3q_p8 vst3q_u8
+
+//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
+void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
+#define vst3q_p16 vst3q_u16
+#endif
+
+//void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0]
+#if defined(USE_SSSE3)
+_NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val)
+{
+    uint8x8x3_t v;
+    __m128i tmp, sh0, sh1;
+    _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
+    _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
+    _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
+    _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
+    tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]);
+    sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask0);         //for bi>15 bi is wrapped (bi-=15)
+    sh1 =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0);
+    v.val[0] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
+    vst1q_u8(ptr,   v.val[0]);         //store as 128 bit structure
+    sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask1);         //for bi>15 bi is wrapped (bi-=15)
+    sh1 =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1);
+    v.val[1] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
+}
+#define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val)
+#endif
+
+//void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0]
+#if defined(USE_SSSE3)
+_NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val)
+{
+    uint16x4x3_t v;
+    __m128i tmp;
+    _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
+    _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23,   0,0,0,0,0,0,0,0};
+    _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff};         //if all ones we take the result from v.val[0]  otherwise from v.val[1]
+    _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff};         //if all ones we take the result from v.val[1]  otherwise from v.val[0]
+    tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]);
+    v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
+    v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0);
+    v.val[0] = _MM_BLENDV_EPI8(v.val[1], v.val[0], *(__m128i*)mask0f);
+    vst1q_u16(ptr,     v.val[0]);         //store as 128 bit structure
+    v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
+    v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1);
+    v.val[1] = _MM_BLENDV_EPI8(v.val[0], v.val[1],  *(__m128i*)mask1f);         //change the operands order
+}
+#define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val)
+#endif
+
+//void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val)
+{         //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
+    uint32x2x3_t res;
+    res.val[0] = _mm_unpacklo_epi64(val->val[1], val->val[2]);         //val[0]: 1,4,2,5
+    res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (2 << 2) | (1 << 4) | (3 << 6));         //1,2,4,5
+    res.val[1] = _mm_srli_si128(res.val[0], 8);         //4,5, x,x
+    res.val[0] = _mm_unpacklo_epi32(val->val[0], res.val[0]);         //0,1,3,2
+    res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (1 << 2) | (3 << 4) | (2 << 6));         //0,1,2, 3
+    vst1q_u32(ptr, res.val[0]);         //store as 128 bit structure
+}
+#define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val)
+
+//void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val)
+{
+    __m128i tmp;
+    tmp =  _mm_unpacklo_epi64(val->val[0], val->val[1]);
+    vst1q_u64(ptr, tmp);         //store as 128 bit structure
+}
+#define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val)
+
+#if defined(USE_SSSE3)
+//void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val)  // VST3.8 {d0, d1, d2}, [r0]
+#define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val)
+
+//void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val)  // VST3.16 {d0, d1, d2}, [r0]
+#define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val)
+#endif
+
+//void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
+#define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val)
+
+//void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0]
+#define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val)
+
+//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
+void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+//void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
+_NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val)
+{         //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
+    float32x2x3_t res;
+    res.val[0] = _mm_castsi128_ps(_mm_unpacklo_epi64(_mm_castps_si128(val->val[1]), _mm_castps_si128(val->val[2])) );
+    res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(3,1,2,0));         //1,2,4,5
+    res.val[1] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(1,0,3,2));         //4,5, 1,2
+    res.val[0] = _mm_unpacklo_ps(val->val[0], res.val[0]);         //0,1,3, 2
+    res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(2,3,1,0));         //0,1,2, 3
+    vst1q_f32(ptr, res.val[0]);         //store as 128 bit structure
+}
+#define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val)
+
+#if defined(USE_SSSE3)
+//void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
+void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
+#define vst3_p8 vst3_u8
+
+//void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
+void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
+#define vst3_p16 vst3_s16
+#endif
+
+//***************  Quadruples store ********************************
+//*********************************************************************
+//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
+{
+    __m128i tmp1, tmp2, res;
+    tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]);         //  0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
+    tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]);         //  2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
+    res = _mm_unpacklo_epi16(tmp1, tmp2);         //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
+    vst1q_u8(ptr,  res);
+    res = _mm_unpackhi_epi16(tmp1, tmp2);         //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
+    vst1q_u8((ptr + 16), res);
+    tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]);         //
+    tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]);         //
+    res = _mm_unpacklo_epi16(tmp1, tmp2);         //
+    vst1q_u8((ptr + 32), res);
+    res = _mm_unpackhi_epi16(tmp1, tmp2);         //
+    vst1q_u8((ptr + 48), res);
+}
+#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
+
+//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
+{
+    uint16x8x4_t v;
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
+    v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
+    v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
+    tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
+    v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
+    v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
+    vst1q_u16(ptr,     v.val[0]);
+    vst1q_u16((ptr + 8), v.val[1]);
+    vst1q_u16((ptr + 16),v.val[2]);
+    vst1q_u16((ptr + 24), v.val[3]);
+}
+#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
+
+//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
+{
+    uint16x8x4_t v;
+    __m128i tmp1, tmp2;
+    tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
+    v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
+    v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
+    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
+    tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
+    v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
+    v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
+    vst1q_u32(ptr,      v.val[0]);
+    vst1q_u32((ptr + 4),  v.val[1]);
+    vst1q_u32((ptr + 8),  v.val[2]);
+    vst1q_u32((ptr + 12), v.val[3]);
+}
+#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
+
+//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
+void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
+#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
+
+//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
+void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
+#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
+
+//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
+void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
+#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
+
+//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
+// IA32 SIMD doesn't work with 16bit floats currently
+
+//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
+_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
+{
+    __m128 tmp3, tmp2, tmp1, tmp0;
+    float32x4x4_t v;
+    tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
+    tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
+    tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
+    tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
+    v.val[0] = _mm_movelh_ps(tmp0, tmp2);
+    v.val[1] = _mm_movehl_ps(tmp2, tmp0);
+    v.val[2] = _mm_movelh_ps(tmp1, tmp3);
+    v.val[3] = _mm_movehl_ps(tmp3, tmp1);
+    vst1q_f32(ptr,   v.val[0]);
+    vst1q_f32((ptr + 4), v.val[1]);
+    vst1q_f32((ptr + 8), v.val[2]);
+    vst1q_f32((ptr + 12), v.val[3]);
+}
+#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
+
+//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
+void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
+#define vst4q_p8 vst4q_u8
+
+//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
+void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
+#define vst4q_p16 vst4q_s16
+
+//void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val)
+{
+    uint8x8x4_t v;
+    __m128i sh0, sh1;
+    sh0 = _mm_unpacklo_epi8(val->val[0],val->val[1]);         // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
+    sh1 = _mm_unpacklo_epi8(val->val[2],val->val[3]);         // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
+    v.val[0] = _mm_unpacklo_epi16(sh0,sh1);         // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
+    v.val[2] = _mm_unpackhi_epi16(sh0,sh1);         //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
+    vst1q_u8(ptr,      v.val[0]);
+    vst1q_u8((ptr + 16),  v.val[2]);
+}
+#define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val)
+
+//void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val)
+{
+    uint16x4x4_t v;
+    __m128i sh0, sh1;
+    sh0 = _mm_unpacklo_epi16(val->val[0],val->val[1]);         //a0,a1,b0,b1,c0,c1,d0,d1,
+    sh1 = _mm_unpacklo_epi16(val->val[2],val->val[3]);         //a2,a3,b2,b3,c2,c3,d2,d3
+    v.val[0] = _mm_unpacklo_epi32(sh0,sh1);         // a0,a1,a2,a3,b0,b1,b2,b3
+    v.val[2] = _mm_unpackhi_epi32(sh0,sh1);         // c0,c1,c2,c3,d0,d1,d2,d3
+    vst1q_u16(ptr,      v.val[0]);         //store as 128 bit structure
+    vst1q_u16((ptr + 8),  v.val[2]);
+}
+#define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val)
+
+//void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val)
+{         //0,4,   1,5,  2,6,  3,7
+    uint32x2x4_t v;
+    __m128i sh0, sh1;
+    sh0 = _mm_unpacklo_epi32(val->val[0], val->val[1]);         //0,1,4,5
+    sh1 = _mm_unpacklo_epi32(val->val[2], val->val[3]);         //2,3,6,7
+    v.val[0] = _mm_unpacklo_epi64(sh0,sh1);         //
+    v.val[1] = _mm_unpackhi_epi64(sh0,sh1);         //
+    vst1q_u32(ptr,     v.val[0]);         //store as 128 bit structure
+    vst1q_u32((ptr + 4),  v.val[1]);
+}
+#define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val)
+
+//void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val)
+{
+    vst1q_u64(ptr,    val->val[0]);
+    vst1q_u64((ptr + 2), val->val[2]);
+}
+#define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val)
+
+//void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val)  //VST4.8 {d0, d1, d2, d3}, [r0]
+#define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
+
+//void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val)  // VST4.16 {d0, d1, d2, d3}, [r0]
+#define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
+
+//void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
+#define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
+
+//void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
+void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
+#define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
+
+//void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
+// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
+
+//void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
+_NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val)
+{         //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
+    float32x2x4_t v;
+    v.val[0] = _mm_unpacklo_ps(val->val[0],val->val[1]);
+    v.val[2] = _mm_unpacklo_ps(val->val[2],val->val[3]);
+    v.val[1] = _mm_movelh_ps (v.val[0], v.val[2]);         //a0, c0, a1,c1,
+    v.val[3] = _mm_movehl_ps (v.val[2],v.val[0]);         //b0,d0, b1, d1
+    vst1q_f32(ptr,     v.val[1]);         //store as 128 bit structure
+    vst1q_f32((ptr + 4),  v.val[3]);
+}
+#define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val)
+
+//void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
+void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val);
+#define vst4_p8 vst4_u8
+
+//void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
+void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val);
+#define vst4_p16 vst4_u16
+
+//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors  *********************
+//********************************************************************************************************************
+//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
+{
+    vst1q_lane_s16(ptr, val->val[0], lane);
+    vst1q_lane_s16((ptr + 1), val->val[1], lane);
+}
+#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
+
+//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
+{
+    vst1q_lane_u32(ptr, val->val[0], lane);
+    vst1q_lane_u32((ptr + 1), val->val[1], lane);
+}
+#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
+
+//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
+void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
+#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
+
+//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
+void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
+#define vst2q_lane_s32(ptr, val, lane)  vst2q_lane_u32((uint32_t*)ptr, val, lane)
+
+//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
+void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
+//current IA SIMD doesn't support float16
+
+//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
+_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
+{
+    vst1q_lane_f32(ptr, val->val[0], lane);
+    vst1q_lane_f32((ptr + 1), val->val[1], lane);
+}
+#define vst2q_lane_f32(ptr, val, lane) vst2q_lane_f32_ptr(ptr, &val, lane)
+
+//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
+void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
+#define vst2q_lane_p16 vst2q_lane_s16
+
+//void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
+void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16 {d0[0], d1[0]}, [r0]
+#define vst2_lane_u16 vst2q_lane_u16
+
+//void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
+void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32 {d0[0], d1[0]}, [r0]
+#define vst2_lane_u32 vst2q_lane_u32
+
+//void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
+void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);
+#define vst2_lane_s8(ptr, val, lane)  vst2_lane_u8((uint8_t*)ptr, val, lane)
+
+//void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
+void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);
+#define vst2_lane_s16 vst2q_lane_s16
+
+//void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
+void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);
+#define vst2_lane_s32 vst2q_lane_s32
+
+//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
+//current IA SIMD doesn't support float16
+
+void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32 {d0[0], d1[0]}, [r0]
+#define vst2_lane_f32 vst2q_lane_f32
+
+//void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
+#define vst2_lane_p8 vst2_lane_u8
+
+//void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
+#define vst2_lane_p16 vst2_lane_u16
+
+//************************* Triple lanes  stores *******************************************************
+//*******************************************************************************************************
+//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
+{
+    vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
+    vst1q_lane_u16((ptr + 2), val->val[2], lane);
+}
+#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
+
+//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
+{
+    vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
+    vst1q_lane_u32((ptr + 2), val->val[2], lane);
+}
+#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
+
+//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
+#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
+
+//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
+#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
+
+//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
+void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
+//current IA SIMD doesn't support float16
+
+//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
+_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
+{
+    vst1q_lane_f32(ptr,   val->val[0], lane);
+    vst1q_lane_f32((ptr + 1),   val->val[1], lane);
+    vst1q_lane_f32((ptr + 2), val->val[2], lane);
+}
+#define vst3q_lane_f32(ptr, val, lane) vst3q_lane_f32_ptr(ptr, &val, lane)
+
+//void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);
+#define  vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
+
+//void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);
+#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
+
+//void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);
+#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
+
+//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
+//current IA SIMD doesn't support float16
+
+//void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);
+#define vst3_lane_f32 vst3q_lane_f32
+
+//void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);
+#define vst3_lane_p8 vst3_lane_u8
+
+//void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
+void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);
+#define vst3_lane_p16 vst3_lane_s16
+
+//******************************** Quadruple lanes stores ***********************************************
+//*******************************************************************************************************
+//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
+{
+    vst2q_lane_u16_ptr(ptr,    (uint16x8x2_t*)val4->val, lane);
+    vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
+}
+#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
+
+//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
+{
+    vst2q_lane_u32_ptr(ptr,     (uint32x4x2_t*)val4->val, lane);
+    vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
+}
+#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
+
+//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
+#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
+
+//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
+#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
+
+//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
+//current IA SIMD doesn't support float16
+
+//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
+{
+    vst1q_lane_f32(ptr,   val->val[0], lane);
+    vst1q_lane_f32((ptr + 1), val->val[1], lane);
+    vst1q_lane_f32((ptr + 2), val->val[2], lane);
+    vst1q_lane_f32((ptr + 3), val->val[3], lane);
+}
+#define vst4q_lane_f32(ptr, val, lane) vst4q_lane_f32_ptr(ptr, &val, lane)
+
+//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
+void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
+#define vst4q_lane_p16 vst4q_lane_u16
+
+//void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane)
+{
+    vst1q_lane_u8(ptr,   val->val[0], lane);
+    vst1q_lane_u8((ptr + 1),  val->val[1], lane);
+    vst1q_lane_u8((ptr + 2), val->val[2], lane);
+    vst1q_lane_u8((ptr + 3), val->val[3], lane);
+}
+#define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane)
+
+//void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane)
+{
+    vst1q_lane_u16(ptr,   val->val[0], lane);
+    vst1q_lane_u16((ptr + 1),val->val[1], lane);
+    vst1q_lane_u16((ptr + 2), val->val[2], lane);
+    vst1q_lane_u16((ptr + 3), val->val[3], lane);
+}
+#define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane)
+
+//void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+_NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane)
+{
+    vst1q_lane_u32(ptr,   val->val[0], lane);
+    vst1q_lane_u32((ptr + 1), val->val[1], lane);
+    vst1q_lane_u32((ptr + 2), val->val[2], lane);
+    vst1q_lane_u32((ptr + 3), val->val[3], lane);
+
+}
+#define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane)
+
+//void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
+
+//void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
+
+//void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
+
+//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
+//current IA SIMD doesn't support float16
+
+//void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+#define vst4_lane_f32 vst4q_lane_f32
+
+//void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);
+#define vst4_lane_p8 vst4_lane_u8
+
+//void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
+void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);
+#define vst4_lane_p16 vst4_lane_u16
+
+//**************************************************************************************************
+//************************ Extract lanes from a vector ********************************************
+//**************************************************************************************************
+//These intrinsics extract a single lane (element) from a vector.
+
+uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
+#define vgetq_lane_u8 _MM_EXTRACT_EPI8
+
+uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.s16 r0, d0[0]
+#define  vgetq_lane_u16 _MM_EXTRACT_EPI16
+
+uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
+#define vgetq_lane_u32 _MM_EXTRACT_EPI32
+
+int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane);         // VMOV.S8 r0, d0[0]
+#define vgetq_lane_s8 vgetq_lane_u8
+
+int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane);         // VMOV.S16 r0, d0[0]
+#define vgetq_lane_s16 vgetq_lane_u16
+
+int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
+#define vgetq_lane_s32 vgetq_lane_u32
+
+poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
+#define vgetq_lane_p8 vgetq_lane_u8
+
+poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.s16 r0, d0[0]
+#define vgetq_lane_p16 vgetq_lane_u16
+
+float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
+_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
+{
+    int32_t ilane;
+    ilane = _MM_EXTRACT_PS(vec,lane);
+    return *(float*)&ilane;
+}
+
+int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
+#define vgetq_lane_s64 (int64_t) vgetq_lane_u64
+
+uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
+#define vgetq_lane_u64 _MM_EXTRACT_EPI64
+
+// ***************** Set lanes within a vector ********************************************
+// **************************************************************************************
+//These intrinsics set a single lane (element) within a vector.
+//same functions as vld1_lane_xx ones, but take the value to be set directly.
+
+uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
+{
+    uint8_t val;
+    val = value;
+    return vld1q_lane_u8(&val, vec,  lane);
+}
+
+uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
+{
+    uint16_t val;
+    val = value;
+    return vld1q_lane_u16(&val, vec,  lane);
+}
+
+uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
+{
+    uint32_t val;
+    val = value;
+    return vld1q_lane_u32(&val, vec,  lane);
+}
+
+int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
+_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
+{
+    int8_t val;
+    val = value;
+    return vld1q_lane_s8(&val, vec,  lane);
+}
+
+int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
+_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
+{
+    int16_t val;
+    val = value;
+    return vld1q_lane_s16(&val, vec,  lane);
+}
+
+int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
+{
+    int32_t val;
+    val = value;
+    return vld1q_lane_s32(&val, vec,  lane);
+}
+
+poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
+#define vsetq_lane_p8 vsetq_lane_u8
+
+poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
+#define vsetq_lane_p16 vsetq_lane_u16
+
+float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
+_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
+{
+    float32_t val;
+    val = value;
+}
+
+int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
+{
+    uint64_t val;
+    val = value;
+    return vld1q_lane_s64(&val, vec,  lane);
+}
+
+uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
+#define vsetq_lane_u64 vsetq_lane_s64
+
+// *******************************************************************************
+// **************** Initialize a vector from bit pattern ***************************
+// *******************************************************************************
+//These intrinsics create a vector from a literal bit pattern.
+
+//no IA32 SIMD avalilable
+
+//********************* Set all lanes to same value ********************************
+//*********************************************************************************
+//These intrinsics set all lanes to the same value.
+
+uint8x16_t   vdupq_n_u8(uint8_t value);         // VDUP.8 q0,r0
+#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
+
+uint16x8_t   vdupq_n_u16(uint16_t value);         // VDUP.16 q0,r0
+#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
+
+uint32x4_t   vdupq_n_u32(uint32_t value);         // VDUP.32 q0,r0
+#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
+
+int8x16_t   vdupq_n_s8(int8_t value);         // VDUP.8 q0,r0
+#define vdupq_n_s8 _mm_set1_epi8
+
+int16x8_t   vdupq_n_s16(int16_t value);         // VDUP.16 q0,r0
+#define vdupq_n_s16 _mm_set1_epi16
+
+int32x4_t   vdupq_n_s32(int32_t value);         // VDUP.32 q0,r0
+#define vdupq_n_s32 _mm_set1_epi32
+
+poly8x16_t vdupq_n_p8(poly8_t value);         // VDUP.8 q0,r0
+#define  vdupq_n_p8 vdupq_n_u8
+
+poly16x8_t vdupq_n_p16(poly16_t value);         // VDUP.16 q0,r0
+#define  vdupq_n_p16 vdupq_n_u16
+
+float32x4_t vdupq_n_f32(float32_t value);         // VDUP.32 q0,r0
+#define vdupq_n_f32 _mm_set1_ps
+
+int64x2_t   vdupq_n_s64(int64_t value);         // VMOV d0,r0,r0
+_NEON2SSE_INLINE int64x2_t   vdupq_n_s64(int64_t value)
+{
+    _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value};         //value may be an immediate
+    return LOAD_SI128(value2);
+}
+
+uint64x2_t   vdupq_n_u64(uint64_t value);         // VMOV d0,r0,r0
+_NEON2SSE_INLINE uint64x2_t   vdupq_n_u64(uint64_t value)
+{
+    _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value};         //value may be an immediate
+    return LOAD_SI128(val);
+}
+
+//****  Set all lanes to same value  ************************
+//Same functions as above - just aliaces.********************
+//Probably they reflect the fact that 128-bit functions versions use VMOV instruction **********
+
+uint8x16_t vmovq_n_u8(uint8_t value);         // VDUP.8 q0,r0
+#define vmovq_n_u8 vdupq_n_u8
+
+uint16x8_t vmovq_n_u16(uint16_t value);         // VDUP.16 q0,r0
+#define vmovq_n_u16 vdupq_n_s16
+
+uint32x4_t vmovq_n_u32(uint32_t value);         // VDUP.32 q0,r0
+#define vmovq_n_u32 vdupq_n_u32
+
+int8x16_t vmovq_n_s8(int8_t value);         // VDUP.8 q0,r0
+#define vmovq_n_s8 vdupq_n_s8
+
+int16x8_t vmovq_n_s16(int16_t value);         // VDUP.16 q0,r0
+#define vmovq_n_s16 vdupq_n_s16
+
+int32x4_t vmovq_n_s32(int32_t value);         // VDUP.32 q0,r0
+#define vmovq_n_s32 vdupq_n_s32
+
+poly8x16_t vmovq_n_p8(poly8_t value);         // VDUP.8 q0,r0
+#define vmovq_n_p8 vdupq_n_u8
+
+poly16x8_t vmovq_n_p16(poly16_t value);         // VDUP.16 q0,r0
+#define vmovq_n_p16 vdupq_n_s16
+
+float32x4_t vmovq_n_f32(float32_t value);         // VDUP.32 q0,r0
+#define vmovq_n_f32 vdupq_n_f32
+
+int64x2_t vmovq_n_s64(int64_t value);         // VMOV d0,r0,r0
+#define vmovq_n_s64 vdupq_n_s64
+
+uint64x2_t vmovq_n_u64(uint64_t value);         // VMOV d0,r0,r0
+#define vmovq_n_u64 vdupq_n_u64
+
+//**************Set all lanes to the value of one lane of a vector *************
+//****************************************************************************
+//here shuffle is better solution than lane extraction followed by set1 function
+
+// ********************************************************************
+// ********************  Combining vectors *****************************
+// ********************************************************************
+//These intrinsics join two 64 bit vectors into a single 128bit vector.
+
+//current IA SIMD doesn't support float16
+
+//**********************************************************************
+//************************* Splitting vectors **************************
+//**********************************************************************
+//**************** Get high part ******************************************
+//These intrinsics split a 128 bit vector into 2 component 64 bit vectors
+
+// IA32 SIMD doesn't work with 16bit floats currently
+
+//********************** Get low part **********************
+//**********************************************************
+
+// IA32 SIMD doesn't work with 16bit floats currently
+
+//**************************************************************************
+//************************ Converting vectors **********************************
+//**************************************************************************
+//************* Convert from float ***************************************
+// need to set _MM_SET_ROUNDING_MODE ( x) accordingly
+
+int32x4_t   vcvtq_s32_f32(float32x4_t a);         // VCVT.S32.F32 q0, q0
+#define vcvtq_s32_f32 _mm_cvtps_epi32
+
+uint32x4_t vcvtq_u32_f32(float32x4_t a);         // VCVT.U32.F32 q0, q0
+_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a)         // VCVT.U32.F32 q0, q0
+{         //No single instruction SSE solution  but we could implement it as following:
+    __m128i resi;
+    __m128 zero,  mask, a_pos, mask_f_max_si, res;
+    _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+    zero = _mm_setzero_ps();
+    mask = _mm_cmpgt_ps(a, zero);
+    a_pos = _mm_and_ps(a, mask);
+    mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff);
+    res =  _mm_sub_ps(a_pos, mask_f_max_si);         //if the input fits to signed we don't subtract anything
+    resi = _mm_cvtps_epi32(res);
+    return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si);
+}
+
+// ***** Convert to the fixed point  with   the number of fraction bits specified by b ***********
+//*************************************************************************************************
+//Intel SIMD doesn't support fixed point
+
+int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.S32.F32 q0, q0, #32
+uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.U32.F32 q0, q0, #32
+
+//***************** Convert to float *************************
+//*************************************************************
+
+float32x4_t vcvtq_f32_s32(int32x4_t a);         // VCVT.F32.S32 q0, q0
+#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
+
+float32x4_t vcvtq_f32_u32(uint32x4_t a);         // VCVT.F32.U32 q0, q0
+_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a)         // VCVT.F32.U32 q0, q0
+{         //solution may be not optimal
+    __m128 two16, fHi, fLo;
+    __m128i hi, lo;
+    two16 = _mm_set1_ps((float)0x10000);         //2^16
+    // Avoid double rounding by doing two exact conversions
+    // of high and low 16-bit segments
+    hi = _mm_srli_epi32(a, 16);
+    lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
+    fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
+    fLo = _mm_cvtepi32_ps(lo);
+    // do single rounding according to current rounding mode
+    return _mm_add_ps(fHi, fLo);
+}
+
+//**************Convert between floats ***********************
+//************************************************************
+
+//Intel SIMD doesn't support 16bits floats curently
+
+//Intel SIMD doesn't support 16bits floats curently, the only solution is to store 16bit floats and load as 32 bits
+
+//************Vector narrow integer conversion (truncation) ******************
+//****************************************************************************
+
+//**************** Vector long move   ***********************
+//***********************************************************
+
+//*************Vector saturating narrow integer*****************
+//**************************************************************
+
+//************* Vector saturating narrow integer signed->unsigned **************
+//*****************************************************************************
+
+// ********************************************************
+// **************** Table look up **************************
+// ********************************************************
+//VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
+//in a table and generate a new vector. Indexes out of range return 0.
+//for Intel SIMD we need to set the MSB to 1 for zero return
+
+//Special trick to avoid __declspec(align('8')) won't be aligned" error
+
+//Special trick to avoid __declspec(align('16')) won't be aligned" error
+
+//****************** Extended table look up intrinsics ***************************
+//**********************************************************************************
+//VTBX (Vector Table Extension) works in the same way as VTBL do,
+// except that indexes out of range leave the destination element unchanged.
+
+//Special trick to avoid __declspec(align('8')) won't be aligned" error
+
+//*************************************************************************************************
+// *************************** Operations with a scalar value *********************************
+//*************************************************************************************************
+
+//******* Vector multiply accumulate by scalar *************************************************
+//**********************************************************************************************
+
+//***************** Vector widening multiply accumulate by scalar **********************
+//***************************************************************************************
+
+// ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
+// ************************************************************************************************
+
+// ****** Vector multiply subtract by scalar *****************
+// *************************************************************
+
+// **** Vector widening multiply subtract by scalar ****
+// ****************************************************
+
+//********* Vector widening saturating doubling multiply subtract by scalar **************************
+//******************************************************************************************************
+
+//********** Vector multiply with scalar *****************************
+
+int16x8_t vmulq_n_s16(int16x8_t a, int16_t b);         // VMUL.I16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b)         // VMUL.I16 q0,q0,d0[0]
+{
+    int16x8_t b16x8;
+    b16x8 = vdupq_n_s16(b);
+    return vmulq_s16(a, b16x8);
+}
+
+int32x4_t vmulq_n_s32(int32x4_t a, int32_t b);         // VMUL.I32 q0,q0,d0[0]
+_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b)         // VMUL.I32 q0,q0,d0[0]
+{
+    int32x4_t b32x4;
+    b32x4 = vdupq_n_s32(b);
+    return vmulq_s32(a, b32x4);
+}
+
+float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);         // VMUL.F32 q0,q0,d0[0]
+_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b)         // VMUL.F32 q0,q0,d0[0]
+{
+    float32x4_t b32x4;
+    b32x4 = vdupq_n_f32(b);
+    return vmulq_f32(a, b32x4);
+}
+
+uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b);         // VMUL.I16 q0,q0,d0[0]
+_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b)         // VMUL.I16 q0,q0,d0[0]
+{
+    uint16x8_t b16x8;
+    b16x8 = vdupq_n_s16(b);
+    return vmulq_s16(a, b16x8);
+}
+
+uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b);         // VMUL.I32 q0,q0,d0[0]
+_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b)         // VMUL.I32 q0,q0,d0[0]
+{
+    uint32x4_t b32x4;
+    b32x4 = vdupq_n_u32(b);
+    return vmulq_u32(a, b32x4);
+}
+
+//**** Vector long multiply with scalar ************
+
+//**** Vector long multiply by scalar ****
+
+//********* Vector saturating doubling long multiply with scalar  *******************
+
+//************* Vector saturating doubling long multiply by scalar ***********************************************
+
+// *****Vector saturating doubling multiply high with scalar *****
+
+int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2);         //  VQDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2)         //  VQDMULH.S16 q0,q0,d0[0]
+{         //solution may be not optimal
+    int16x8_t scalar;
+    scalar = vdupq_n_s16(val2);
+    return vqdmulhq_s16(vec1, scalar);
+}
+
+int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2);         //  VQDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32x4_t scalar;
+    scalar = vdupq_n_s32(val2);
+    return vqdmulhq_s32(vec1, scalar);
+}
+
+//***** Vector saturating doubling multiply high by scalar ****************
+
+//******** Vector saturating rounding doubling multiply high with scalar ***
+
+#if defined(USE_SSSE3)
+int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2);         // VQRDMULH.S16 q0,q0,d0[0]
+_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2)         // VQRDMULH.S16 q0,q0,d0[0]
+{         //solution may be not optimal
+    int16x8_t scalar;
+    scalar = vdupq_n_s16(val2);
+    return vqrdmulhq_s16(vec1, scalar);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2);         // VQRDMULH.S32 q0,q0,d0[0]
+_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
+{
+    int32x4_t scalar;
+    scalar = vdupq_n_s32(val2);
+    return vqrdmulhq_s32(vec1, scalar);
+}
+#endif
+
+//********* Vector rounding saturating doubling multiply high by scalar  ****
+
+//**************Vector multiply accumulate with scalar *******************
+
+int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLA.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c)         // VMLA.I16 q0, q0, d0[0]
+{
+    int16x8_t scalar;
+    scalar = vdupq_n_s16(c);
+    return vmlaq_s16(a,b,scalar);
+}
+
+int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLA.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c)         // VMLA.I32 q0, q0, d0[0]
+{
+    int32x4_t scalar;
+    scalar = vdupq_n_s32(c);
+    return vmlaq_s32(a,b,scalar);
+}
+
+uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLA.I16 q0, q0, d0[0]
+#define vmlaq_n_u16 vmlaq_n_s16
+
+uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLA.I32 q0, q0, d0[0]
+#define vmlaq_n_u32 vmlaq_n_s32
+
+float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLA.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c)         // VMLA.F32 q0, q0, d0[0]
+{
+    float32x4_t scalar;
+    scalar = vdupq_n_f32(c);
+    return vmlaq_f32(a,b,scalar);
+}
+
+//************Vector widening multiply accumulate with scalar****************************
+
+//************ Vector widening saturating doubling multiply accumulate with scalar **************
+
+//******** Vector multiply subtract with scalar **************
+
+int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c)         // VMLS.I16 q0, q0, d0[0]
+{
+    int16x8_t vc;
+    vc = vdupq_n_s16(c);
+    return vmlsq_s16(a, b,vc);
+}
+
+int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c)         // VMLS.I32 q0, q0, d0[0]
+{
+    int32x4_t vc;
+    vc = vdupq_n_s32(c);
+    return vmlsq_s32(a,b,vc);
+}
+
+uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLS.I16 q0, q0, d0[0]
+_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c)         // VMLS.I16 q0, q0, d0[0]
+{
+    uint32x4_t vc;
+    vc = vdupq_n_u32(c);
+    return vmlsq_u32(a,b,vc);
+}
+
+uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLS.I32 q0, q0, d0[0]
+_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c)         // VMLS.I32 q0, q0, d0[0]
+{
+    uint32x4_t vc;
+    vc = vdupq_n_u32(c);
+    return vmlsq_u32(a,b,vc);
+}
+
+float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLS.F32 q0, q0, d0[0]
+_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
+{
+    float32x4_t vc;
+    vc = vdupq_n_f32(c);
+    return vmlsq_f32(a,b,vc);
+}
+
+//**** Vector widening multiply subtract with scalar ******
+
+//***** Vector widening saturating doubling multiply subtract with scalar *********
+//**********************************************************************************
+
+//*******************  Vector extract ***********************************************
+//*************************************************************************************
+//VEXT (Vector Extract) extracts  elements from the bottom end of the second operand
+//vector and the top end of the first, concatenates them, and places the result in the destination vector
+//c elements from the bottom end of the second operand and (8-c) from the top end of the first
+
+#if defined(USE_SSSE3)
+//same result tested
+
+#endif
+
+#if defined(USE_SSSE3)
+int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
+#define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
+
+uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
+#define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
+
+poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
+#define vextq_p8 vextq_s8
+
+int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
+#define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
+
+uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
+#define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
+
+poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
+#define vextq_p16 vextq_s16
+#endif
+
+#if defined(USE_SSSE3)
+int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
+#define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
+
+uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
+#define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
+
+int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
+#define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
+
+uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
+#define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
+#endif
+
+//************ Reverse vector elements (swap endianness)*****************
+//*************************************************************************
+//VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
+
+#if defined(USE_SSSE3)
+int8x16_t vrev64q_s8(int8x16_t vec);         // VREV64.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec)         // VREV64.8 q0,q0
+{
+    _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int16x8_t vrev64q_s16(int16x8_t vec);         // VREV64.16 q0,q0
+_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec)         // VREV64.16 q0,q0
+{         //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
+    _NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
+}
+#endif
+
+int32x4_t vrev64q_s32(int32x4_t vec);         // VREV64.32 q0,q0
+_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec)         // VREV64.32 q0,q0
+{
+    return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
+}
+
+#if defined(USE_SSSE3)
+uint8x16_t vrev64q_u8(uint8x16_t vec);         // VREV64.8 q0,q0
+#define vrev64q_u8 vrev64q_s8
+
+uint16x8_t vrev64q_u16(uint16x8_t vec);         // VREV64.16 q0,q0
+#define vrev64q_u16 vrev64q_s16
+#endif
+
+uint32x4_t vrev64q_u32(uint32x4_t vec);         // VREV64.32 q0,q0
+#define vrev64q_u32 vrev64q_s32
+
+#if defined(USE_SSSE3)
+poly8x16_t vrev64q_p8(poly8x16_t vec);         // VREV64.8 q0,q0
+#define vrev64q_p8 vrev64q_u8
+
+poly16x8_t vrev64q_p16(poly16x8_t vec);         // VREV64.16 q0,q0
+#define vrev64q_p16 vrev64q_s16
+#endif
+
+float32x4_t vrev64q_f32(float32x4_t vec);         // VREV64.32 q0,q0
+#define vrev64q_f32(vec) _mm_shuffle_ps (vec,  vec, _MM_SHUFFLE(2,3, 0,1))
+
+//********************  32 bit shuffles **********************
+//************************************************************
+
+#if defined(USE_SSSE3)
+int8x16_t vrev32q_s8(int8x16_t vec);         // VREV32.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec)         // VREV32.8 q0,q0
+{
+    _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int16x8_t vrev32q_s16(int16x8_t vec);         // VREV32.16 q0,q0
+_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec)         // VREV32.16 q0,q0
+{
+    _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
+}
+#endif
+
+#if defined(USE_SSSE3)
+uint8x16_t vrev32q_u8(uint8x16_t vec);         // VREV32.8 q0,q0
+#define vrev32q_u8 vrev32q_s8
+
+uint16x8_t vrev32q_u16(uint16x8_t vec);         // VREV32.16 q0,q0
+#define vrev32q_u16 vrev32q_s16
+
+poly8x16_t vrev32q_p8(poly8x16_t vec);         // VREV32.8 q0,q0
+#define vrev32q_p8 vrev32q_u8
+#endif
+
+//*************  16 bit shuffles **********************
+//******************************************************
+
+#if defined(USE_SSSE3)
+int8x16_t vrev16q_s8(int8x16_t vec);         // VREV16.8 q0,q0
+_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec)         // VREV16.8 q0,q0
+{
+    _NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
+    return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev8);
+}
+#endif
+
+#if defined(USE_SSSE3)
+uint8x16_t vrev16q_u8(uint8x16_t vec);         // VREV16.8 q0,q0
+#define vrev16q_u8 vrev16q_s8
+
+poly8x16_t vrev16q_p8(poly8x16_t vec);         // VREV16.8 q0,q0
+#define vrev16q_p8 vrev16q_u8
+#endif
+
+//*********************************************************************
+//**************** Other single operand arithmetic *******************
+//*********************************************************************
+
+//*********** Absolute: Vd[i] = |Va[i]| **********************************
+//************************************************************************
+
+int8x16_t   vabsq_s8(int8x16_t a);         // VABS.S8 q0,q0
+#define vabsq_s8 _mm_abs_epi8
+
+int16x8_t   vabsq_s16(int16x8_t a);         // VABS.S16 q0,q0
+#define vabsq_s16 _mm_abs_epi16
+
+int32x4_t   vabsq_s32(int32x4_t a);         // VABS.S32 q0,q0
+#define vabsq_s32 _mm_abs_epi32
+
+float32x4_t vabsq_f32(float32x4_t a);         // VABS.F32 q0,q0
+_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a)         // VABS.F32 q0,q0
+{
+    _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
+    return _mm_and_ps (a, *(__m128*)c7fffffff);
+}
+
+//****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
+//**********************************************************************
+//For signed-integer data types, the absolute value of the most negative value is not representable by the data type, saturation takes place
+
+#if defined(USE_SSSE3)
+int8x16_t vqabsq_s8(int8x16_t a);         // VQABS.S8 q0,q0
+_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a)         // VQABS.S8 q0,q0
+{
+    __m128i c_128, abs, abs_cmp;
+    c_128 = _mm_set1_epi8 (0x80);         //-128
+    abs = _mm_abs_epi8 (a);
+    abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
+    return _mm_xor_si128 (abs,  abs_cmp);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int16x8_t vqabsq_s16(int16x8_t a);         // VQABS.S16 q0,q0
+_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a)         // VQABS.S16 q0,q0
+{
+    __m128i c_32768, abs, abs_cmp;
+    c_32768 = _mm_set1_epi16 (0x8000);         //-32768
+    abs = _mm_abs_epi16 (a);
+    abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
+    return _mm_xor_si128 (abs,  abs_cmp);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int32x4_t vqabsq_s32(int32x4_t a);         // VQABS.S32 q0,q0
+_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a)         // VQABS.S32 q0,q0
+{
+    __m128i c80000000, abs, abs_cmp;
+    c80000000 = _mm_set1_epi32 (0x80000000);         //most negative value
+    abs = _mm_abs_epi32 (a);
+    abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
+    return _mm_xor_si128 (abs,  abs_cmp);
+}
+#endif
+
+//*************** Negate: Vd[i] = - Va[i] *************************************
+//*****************************************************************************
+//several Negate implementations possible for SIMD.
+//e.//function _mm_sign function(a, negative numbers vector), but the following one gives good performance:
+
+int8x16_t vnegq_s8(int8x16_t a);         // VNE//q0,q0
+_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a)         // VNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_sub_epi8 (zero, a);
+}         //or _mm_sign_epi8 (a, negative numbers vector)
+
+int16x8_t vnegq_s16(int16x8_t a);         // VNE//q0,q0
+_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a)         // VNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_sub_epi16 (zero, a);
+}         //or _mm_sign_epi16 (a, negative numbers vector)
+
+int32x4_t vnegq_s32(int32x4_t a);         // VNE//q0,q0
+_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a)         // VNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_sub_epi32 (zero, a);
+}         //or _mm_sign_epi32 (a, negative numbers vector)
+
+float32x4_t vnegq_f32(float32x4_t a);         // VNE//q0,q0
+_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a)         // VNE//q0,q0
+{
+    _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+    return _mm_xor_ps (a, *(__m128*) c80000000);
+}
+
+//************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
+//***************************************************************************************
+//For signed-integer data types, the negation of the most negative value can't be produced without saturation, while with saturation it is max positive
+
+int8x16_t vqnegq_s8(int8x16_t a);         // VQNE//q0,q0
+_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a)         // VQNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_subs_epi8 (zero, a);         //saturating substraction
+}
+
+int16x8_t vqnegq_s16(int16x8_t a);         // VQNE//q0,q0
+_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a)         // VQNE//q0,q0
+{
+    __m128i zero;
+    zero = _mm_setzero_si128 ();
+    return _mm_subs_epi16 (zero, a);         //saturating substraction
+}
+
+int32x4_t vqnegq_s32(int32x4_t a);         // VQNE//q0,q0
+_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a)         // VQNE//q0,q0
+{         //solution may be not optimal compared with a serial
+    __m128i c80000000, zero, sub, cmp;
+    c80000000 = _mm_set1_epi32 (0x80000000);         //most negative value
+    zero = _mm_setzero_si128 ();
+    sub =  _mm_sub_epi32 (zero, a);         //substraction
+    cmp = _mm_cmpeq_epi32 (a, c80000000);
+    return _mm_xor_si128 (sub,  cmp);
+}
+
+//****************** Count leading zeros ********************************
+//**************************************************************************
+//no corresponding vector intrinsics in IA32, need to implement it.  While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
+
+#if defined(USE_SSSE3)
+int8x16_t vclzq_s8(int8x16_t a);         // VCLZ.I8 q0,q0
+_NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
+{
+    _NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
+                                       /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
+                                       /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
+                                       /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0};
+    __m128i maskLOW, c4, lowclz, mask, hiclz;
+    maskLOW = _mm_set1_epi8(0x0f);         //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
+    c4 = _mm_set1_epi8(4);
+    lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a);         //uses low 4 bits anyway
+    mask =  _mm_srli_epi16(a, 4);         //get high 4 bits as low bits
+    mask = _mm_and_si128(mask, maskLOW);         //low 4 bits, need masking to avoid zero if MSB is set
+    hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask);         //uses low 4 bits anyway
+    mask = _mm_cmpeq_epi8(hiclz, c4);         // shows the need to add lowclz zeros
+    lowclz = _mm_and_si128(lowclz,mask);
+    return _mm_add_epi8(lowclz, hiclz);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int16x8_t vclzq_s16(int16x8_t a);         // VCLZ.I16 q0,q0
+_NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
+{
+    __m128i c7, res8x16, res8x16_swap;
+    _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
+    _NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
+    c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5);         //7
+    res8x16 = vclzq_s8(a);
+    res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab);         //horisontal pairs swap
+    res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit);         //lowclz
+    res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit);         //hiclz
+    c7 = _mm_cmpgt_epi16(res8x16_swap, c7);         // shows the need to add lowclz zeros
+    res8x16 = _mm_and_si128(res8x16, c7);         //lowclz
+    return _mm_add_epi16(res8x16_swap, res8x16);
+}
+#endif
+
+int32x4_t vclzq_s32(int32x4_t a);         // VCLZ.I32 q0,q0
+_NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
+{
+    __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
+    c55555555 = _mm_set1_epi32(0x55555555);
+    c33333333 = _mm_set1_epi32(0x33333333);
+    c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
+    c3f = _mm_set1_epi32(0x3f);
+    c32 = _mm_set1_epi32(32);
+    tmp = _mm_srli_epi32(a, 1);
+    res = _mm_or_si128(tmp, a);         //atmp[i] |= (atmp[i] >> 1);
+    tmp = _mm_srli_epi32(res, 2);
+    res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 2);
+    tmp = _mm_srli_epi32(res, 4);
+    res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 4);
+    tmp = _mm_srli_epi32(res, 8);
+    res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 8);
+    tmp = _mm_srli_epi32(res, 16);
+    res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 16);
+
+    tmp = _mm_srli_epi32(res, 1);
+    tmp = _mm_and_si128(tmp, c55555555);
+    res = _mm_sub_epi32(res, tmp);         //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
+
+    tmp = _mm_srli_epi32(res, 2);
+    tmp = _mm_and_si128(tmp, c33333333);
+    tmp1 = _mm_and_si128(res, c33333333);
+    res = _mm_add_epi32(tmp, tmp1);         //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
+
+    tmp = _mm_srli_epi32(res, 4);
+    tmp = _mm_add_epi32(tmp, res);
+    res = _mm_and_si128(tmp, c0f0f0f0f);         //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
+
+    tmp = _mm_srli_epi32(res, 8);
+    res = _mm_add_epi32(tmp, res);         //atmp[i] += (atmp[i] >> 8);
+
+    tmp = _mm_srli_epi32(res, 16);
+    res = _mm_add_epi32(tmp, res);         //atmp[i] += (atmp[i] >> 16);
+
+    res = _mm_and_si128(res, c3f);         //atmp[i] = atmp[i] & 0x0000003f;
+
+    return _mm_sub_epi32(c32, res);         //res[i] = 32 - atmp[i];
+}
+
+#if defined(USE_SSSE3)
+uint8x16_t vclzq_u8(uint8x16_t a);         // VCLZ.I8 q0,q0
+#define vclzq_u8 vclzq_s8
+
+uint16x8_t vclzq_u16(uint16x8_t a);         // VCLZ.I16 q0,q0
+#define vclzq_u16 vclzq_s16
+#endif
+
+uint32x4_t vclzq_u32(uint32x4_t a);         // VCLZ.I32 q0,q0
+#define vclzq_u32 vclzq_s32
+
+//************** Count leading sign bits **************************
+//********************************************************************
+//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
+// the topmost bit, that are the same as the topmost bit, in each element in a vector
+//No corresponding vector intrinsics in IA32, need to implement it.
+//While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
+
+#if defined(USE_SSSE3)
+int8x16_t vclsq_s8(int8x16_t a);         // VCLS.S8 q0,q0
+_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
+{
+    __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
+    cff = _mm_cmpeq_epi8 (a,a);         //0xff
+    c80 = _mm_set1_epi8(0x80);
+    c1 = _mm_set1_epi8(1);
+    a_mask = _mm_and_si128(a, c80);
+    a_mask = _mm_cmpeq_epi8(a_mask, c80);         //0xff if negative input and 0 if positive
+    a_neg = _mm_xor_si128(a, cff);
+    a_neg = _mm_and_si128(a_mask, a_neg);
+    a_pos = _mm_andnot_si128(a_mask, a);
+    a_comb = _mm_or_si128(a_pos, a_neg);
+    a_comb = vclzq_s8(a_comb);
+    return _mm_sub_epi8(a_comb, c1);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int16x8_t vclsq_s16(int16x8_t a);         // VCLS.S16 q0,q0
+_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
+{
+    __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
+    cffff = _mm_cmpeq_epi16(a,a);
+    c8000 =  _mm_slli_epi16(cffff, 15);         //0x8000
+    c1 = _mm_srli_epi16(cffff,15);         //0x1
+    a_mask = _mm_and_si128(a, c8000);
+    a_mask = _mm_cmpeq_epi16(a_mask, c8000);         //0xffff if negative input and 0 if positive
+    a_neg = _mm_xor_si128(a, cffff);
+    a_neg = _mm_and_si128(a_mask, a_neg);
+    a_pos = _mm_andnot_si128(a_mask, a);
+    a_comb = _mm_or_si128(a_pos, a_neg);
+    a_comb = vclzq_s16(a_comb);
+    return _mm_sub_epi16(a_comb, c1);
+}
+#endif
+
+int32x4_t vclsq_s32(int32x4_t a);         // VCLS.S32 q0,q0
+_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
+{
+    __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
+    cffffffff = _mm_cmpeq_epi32(a,a);
+    c80000000 =  _mm_slli_epi32(cffffffff, 31);         //0x80000000
+    c1 = _mm_srli_epi32(cffffffff,31);         //0x1
+    a_mask = _mm_and_si128(a, c80000000);
+    a_mask = _mm_cmpeq_epi32(a_mask, c80000000);         //0xffffffff if negative input and 0 if positive
+    a_neg = _mm_xor_si128(a, cffffffff);
+    a_neg = _mm_and_si128(a_mask, a_neg);
+    a_pos = _mm_andnot_si128(a_mask, a);
+    a_comb = _mm_or_si128(a_pos, a_neg);
+    a_comb = vclzq_s32(a_comb);
+    return _mm_sub_epi32(a_comb, c1);
+}
+
+//************************* Count number of set bits   ********************************
+//*************************************************************************************
+//No corresponding SIMD solution. One option is to get a elements, convert it to 32 bits and then use SSE4.2  _mm_popcnt__u32 (unsigned int v) for each element
+//another option is to do the following algorithm:
+
+#if defined(USE_SSSE3)
+uint8x16_t vcntq_u8(uint8x16_t a);         // VCNT.8 q0,q0
+_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
+{
+    _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
+                                            /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
+                                            /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
+                                            /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
+    __m128i maskLOW, mask, lowpopcnt, hipopcnt;
+    maskLOW = _mm_set1_epi8(0x0f);         //low 4 bits, need masking to avoid zero if MSB is set
+    mask = _mm_and_si128(a, maskLOW);
+    lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask);         //uses low 4 bits anyway
+    mask =  _mm_srli_epi16(a, 4);         //get high 4 bits as low bits
+    mask = _mm_and_si128(mask, maskLOW);         //low 4 bits, need masking to avoid zero if MSB is set
+    hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask);         //uses low 4 bits anyway
+    return _mm_add_epi8(lowpopcnt, hipopcnt);
+}
+#endif
+
+#if defined(USE_SSSE3)
+int8x16_t vcntq_s8(int8x16_t a);         // VCNT.8 q0,q0
+#define vcntq_s8 vcntq_u8
+
+poly8x16_t vcntq_p8(poly8x16_t a);         // VCNT.8 q0,q0
+#define vcntq_p8 vcntq_u8
+#endif
+
+//**************************************************************************************
+//*********************** Logical operations ****************************************
+//**************************************************************************************
+//************************** Bitwise not ***********************************
+//several Bitwise not implementations possible for SIMD. Eg "xor" with all ones, but the following one gives good performance
+
+int8x16_t vmvnq_s8(int8x16_t a);         // VMVN q0,q0
+_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a)         // VMVN q0,q0
+{
+    __m128i c1;
+    c1 = _mm_cmpeq_epi8 (a,a);         //0xff
+    return _mm_andnot_si128 (a, c1);
+}
+
+int16x8_t vmvnq_s16(int16x8_t a);         // VMVN q0,q0
+_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a)         // VMVN q0,q0
+{
+    __m128i c1;
+    c1 = _mm_cmpeq_epi16 (a,a);         //0xffff
+    return _mm_andnot_si128 (a, c1);
+}
+
+int32x4_t vmvnq_s32(int32x4_t a);         // VMVN q0,q0
+_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a)         // VMVN q0,q0
+{
+    __m128i c1;
+    c1 = _mm_cmpeq_epi32 (a,a);         //0xffffffff
+    return _mm_andnot_si128 (a, c1);
+}
+
+uint8x16_t vmvnq_u8(uint8x16_t a);         // VMVN q0,q0
+#define vmvnq_u8 vmvnq_s8
+
+uint16x8_t vmvnq_u16(uint16x8_t a);         // VMVN q0,q0
+#define vmvnq_u16 vmvnq_s16
+
+uint32x4_t vmvnq_u32(uint32x4_t a);         // VMVN q0,q0
+#define vmvnq_u32 vmvnq_s32
+
+poly8x16_t vmvnq_p8(poly8x16_t a);         // VMVN q0,q0
+#define vmvnq_p8 vmvnq_u8
+
+//****************** Bitwise and ***********************
+//******************************************************
+
+int8x16_t   vandq_s8(int8x16_t a, int8x16_t b);         // VAND q0,q0,q0
+#define vandq_s8 _mm_and_si128
+
+int16x8_t   vandq_s16(int16x8_t a, int16x8_t b);         // VAND q0,q0,q0
+#define vandq_s16 _mm_and_si128
+
+int32x4_t   vandq_s32(int32x4_t a, int32x4_t b);         // VAND q0,q0,q0
+#define vandq_s32 _mm_and_si128
+
+int64x2_t   vandq_s64(int64x2_t a, int64x2_t b);         // VAND q0,q0,q0
+#define vandq_s64 _mm_and_si128
+
+uint8x16_t   vandq_u8(uint8x16_t a, uint8x16_t b);         // VAND q0,q0,q0
+#define vandq_u8 _mm_and_si128
+
+uint16x8_t   vandq_u16(uint16x8_t a, uint16x8_t b);         // VAND q0,q0,q0
+#define vandq_u16 _mm_and_si128
+
+uint32x4_t   vandq_u32(uint32x4_t a, uint32x4_t b);         // VAND q0,q0,q0
+#define vandq_u32 _mm_and_si128
+
+uint64x2_t   vandq_u64(uint64x2_t a, uint64x2_t b);         // VAND q0,q0,q0
+#define vandq_u64 _mm_and_si128
+
+//******************** Bitwise or *********************************
+//******************************************************************
+
+int8x16_t   vorrq_s8(int8x16_t a, int8x16_t b);         // VORR q0,q0,q0
+#define vorrq_s8 _mm_or_si128
+
+int16x8_t   vorrq_s16(int16x8_t a, int16x8_t b);         // VORR q0,q0,q0
+#define vorrq_s16 _mm_or_si128
+
+int32x4_t   vorrq_s32(int32x4_t a, int32x4_t b);         // VORR q0,q0,q0
+#define vorrq_s32 _mm_or_si128
+
+int64x2_t   vorrq_s64(int64x2_t a, int64x2_t b);         // VORR q0,q0,q0
+#define vorrq_s64 _mm_or_si128
+
+uint8x16_t   vorrq_u8(uint8x16_t a, uint8x16_t b);         // VORR q0,q0,q0
+#define vorrq_u8 _mm_or_si128
+
+uint16x8_t   vorrq_u16(uint16x8_t a, uint16x8_t b);         // VORR q0,q0,q0
+#define vorrq_u16 _mm_or_si128
+
+uint32x4_t   vorrq_u32(uint32x4_t a, uint32x4_t b);         // VORR q0,q0,q0
+#define vorrq_u32 _mm_or_si128
+
+uint64x2_t   vorrq_u64(uint64x2_t a, uint64x2_t b);         // VORR q0,q0,q0
+#define vorrq_u64 _mm_or_si128
+
+//************* Bitwise exclusive or (EOR or XOR) ******************
+//*******************************************************************
+
+int8x16_t   veorq_s8(int8x16_t a, int8x16_t b);         // VEOR q0,q0,q0
+#define veorq_s8 _mm_xor_si128
+
+int16x8_t   veorq_s16(int16x8_t a, int16x8_t b);         // VEOR q0,q0,q0
+#define veorq_s16 _mm_xor_si128
+
+int32x4_t   veorq_s32(int32x4_t a, int32x4_t b);         // VEOR q0,q0,q0
+#define veorq_s32 _mm_xor_si128
+
+int64x2_t   veorq_s64(int64x2_t a, int64x2_t b);         // VEOR q0,q0,q0
+#define veorq_s64 _mm_xor_si128
+
+uint8x16_t   veorq_u8(uint8x16_t a, uint8x16_t b);         // VEOR q0,q0,q0
+#define veorq_u8 _mm_xor_si128
+
+uint16x8_t   veorq_u16(uint16x8_t a, uint16x8_t b);         // VEOR q0,q0,q0
+#define veorq_u16 _mm_xor_si128
+
+uint32x4_t   veorq_u32(uint32x4_t a, uint32x4_t b);         // VEOR q0,q0,q0
+#define veorq_u32 _mm_xor_si128
+
+uint64x2_t   veorq_u64(uint64x2_t a, uint64x2_t b);         // VEOR q0,q0,q0
+#define veorq_u64 _mm_xor_si128
+
+//********************** Bit Clear **********************************
+//*******************************************************************
+//Logical AND complement (AND negation or AND NOT)
+
+//notice arguments "swap"
+
+//notice arguments "swap"
+
+//notice arguments "swap"
+
+//notice arguments "swap"
+
+//notice arguments "swap"
+
+//notice arguments "swap"
+
+//notice arguments "swap"
+
+//notice arguments "swap"
+
+int8x16_t   vbicq_s8(int8x16_t a, int8x16_t b);         // VBIC q0,q0,q0
+#define vbicq_s8(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+
+int16x8_t   vbicq_s16(int16x8_t a, int16x8_t b);         // VBIC q0,q0,q0
+#define vbicq_s16(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+
+int32x4_t   vbicq_s32(int32x4_t a, int32x4_t b);         // VBIC q0,q0,q0
+#define vbicq_s32(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+
+int64x2_t   vbicq_s64(int64x2_t a, int64x2_t b);         // VBIC q0,q0,q0
+#define vbicq_s64(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+
+uint8x16_t   vbicq_u8(uint8x16_t a, uint8x16_t b);         // VBIC q0,q0,q0
+#define vbicq_u8(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+
+uint16x8_t   vbicq_u16(uint16x8_t a, uint16x8_t b);         // VBIC q0,q0,q0
+#define vbicq_u16(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+
+uint32x4_t   vbicq_u32(uint32x4_t a, uint32x4_t b);         // VBIC q0,q0,q0
+#define vbicq_u32(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+
+uint64x2_t   vbicq_u64(uint64x2_t a, uint64x2_t b);         // VBIC q0,q0,q0
+#define vbicq_u64(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
+
+//**************** Bitwise OR complement ********************************
+//**************************************** ********************************
+//no exact IA 32 match, need to implement it as following
+
+int8x16_t vornq_s8(int8x16_t a, int8x16_t b);         // VORN q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b)         // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_s8( b);         //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+int16x8_t vornq_s16(int16x8_t a, int16x8_t b);         // VORN q0,q0,q0
+_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b)         // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_s16( b);         //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+int32x4_t vornq_s32(int32x4_t a, int32x4_t b);         // VORN q0,q0,q0
+_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b)         // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_s32( b);         //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+int64x2_t vornq_s64(int64x2_t a, int64x2_t b);         // VORN q0,q0,q0
+_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
+{
+    __m128i c1, b1;
+    c1 = _mm_cmpeq_epi8 (a, a);         //all ones 0xfffffff...fffff
+    b1 = _mm_andnot_si128 (b, c1);
+    return _mm_or_si128 (a, b1);
+}
+
+uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b);         // VORN q0,q0,q0
+_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b)         // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_u8( b);         //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b);         // VORN q0,q0,q0
+_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b)         // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_s16( b);         //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+
+uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b);         // VORN q0,q0,q0
+_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b)         // VORN q0,q0,q0
+{
+    __m128i b1;
+    b1 = vmvnq_u32( b);         //bitwise not for b
+    return _mm_or_si128 (a, b1);
+}
+uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b);         // VORN q0,q0,q0
+#define vornq_u64 vornq_s64
+
+//********************* Bitwise Select *****************************
+//******************************************************************
+//Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????)
+
+//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
+//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
+
+//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
+//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
+
+//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
+//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
+
+//VBSL only is implemented for SIMD
+
+int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c);         // VBSL q0,q0,q0
+_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c)         // VBSL q0,q0,q0
+{
+    __m128i sel1, sel2;
+    sel1 = _mm_and_si128   (a, b);
+    sel2 = _mm_andnot_si128 (a, c);
+    return _mm_or_si128 (sel1, sel2);
+}
+
+int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c);         // VBSL q0,q0,q0
+#define vbslq_s16 vbslq_s8
+
+int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c);         // VBSL q0,q0,q0
+#define vbslq_s32 vbslq_s8
+
+int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c);         // VBSL q0,q0,q0
+#define vbslq_s64 vbslq_s8
+
+uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VBSL q0,q0,q0
+#define vbslq_u8 vbslq_s8
+
+uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VBSL q0,q0,q0
+#define vbslq_u16 vbslq_s8
+
+uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VBSL q0,q0,q0
+#define vbslq_u32 vbslq_s8
+
+uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c);         // VBSL q0,q0,q0
+#define vbslq_u64 vbslq_s8
+
+float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c);         // VBSL q0,q0,q0
+_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c)         // VBSL q0,q0,q0
+{
+    __m128 sel1, sel2;
+    sel1 = _mm_and_ps   (*(__m128*)&a, b);
+    sel2 = _mm_andnot_ps (*(__m128*)&a, c);
+    return _mm_or_ps (sel1, sel2);
+}
+
+poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c);         // VBSL q0,q0,q0
+#define vbslq_p8 vbslq_u8
+
+poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c);         // VBSL q0,q0,q0
+#define vbslq_p16 vbslq_s8
+
+//************************************************************************************
+//**************** Transposition operations ****************************************
+//************************************************************************************
+//*****************  Vector Transpose ************************************************
+//************************************************************************************
+//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
+// making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....)
+
+#if defined(USE_SSSE3)
+//int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b)         // VTRN.8 q0,q0
+{
+    int8x16x2_t r8x16;
+    __m128i a_sh, b_sh;
+    _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd);         //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd);         //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
+
+    r8x16.val[0] =  _mm_unpacklo_epi8(a_sh, b_sh);         //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
+    r8x16.val[1] =  _mm_unpackhi_epi8(a_sh, b_sh);         // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
+    return r8x16;
+}
+#endif
+
+#if defined(USE_SSSE3)
+int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b);         // VTRN.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b)         // VTRN.16 q0,q0
+{
+    int16x8x2_t v16x8;
+    __m128i a_sh, b_sh;
+    _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd);         //a0, a2, a4, a6,  a1, a3, a5, a7
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd);         //b0, b2, b4, b6,  b1, b3, b5, b7
+    v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh);         //a0, b0, a2, b2, a4, b4, a6, b6
+    v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh);         //a1, b1, a3, b3, a5, b5, a7, b7
+    return v16x8;
+}
+#endif
+
+int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b);         // VTRN.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b)         // VTRN.32 q0,q0
+{         //may be not optimal solution compared with serial
+    int32x4x2_t v32x4;
+    __m128i a_sh, b_sh;
+    a_sh = _mm_shuffle_epi32 (a, 216);         //a0, a2, a1, a3
+    b_sh = _mm_shuffle_epi32 (b, 216);         //b0, b2, b1, b3
+
+    v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh);         //a0, b0, a2, b2
+    v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh);         //a1, b1, a3,  b3
+    return v32x4;
+}
+
+#if defined(USE_SSSE3)
+uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b);         // VTRN.8 q0,q0
+#define vtrnq_u8 vtrnq_s8
+
+uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b);         // VTRN.16 q0,q0
+#define vtrnq_u16 vtrnq_s16
+#endif
+
+uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b);         // VTRN.32 q0,q0
+#define vtrnq_u32 vtrnq_s32
+
+float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b);         // VTRN.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b)         // VTRN.32 q0,q0
+{         //may be not optimal solution compared with serial
+    float32x4x2_t f32x4;
+    __m128 a_sh, b_sh;
+    a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0));         //a0, a2, a1, a3, need to check endiness
+    b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0));         //b0, b2, b1, b3, need to check endiness
+
+    f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh);         //a0, b0, a2, b2
+    f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh);         //a1, b1, a3,  b3
+    return f32x4;
+}
+
+#if defined(USE_SSSE3)
+poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b);         // VTRN.8 q0,q0
+#define vtrnq_p8 vtrnq_s8
+
+poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b);         // VTRN.16 q0,q0
+#define vtrnq_p16 vtrnq_s16
+#endif
+
+//***************** Interleave elements ***************************
+//*****************************************************************
+//output has (a0,b0,a1,b1, a2,b2,.....)
+
+int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b);         // VZIP.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b)         // VZIP.8 q0,q0
+{
+    int8x16x2_t r8x16;
+    r8x16.val[0] =  _mm_unpacklo_epi8(a, b);
+    r8x16.val[1] =  _mm_unpackhi_epi8(a, b);
+    return r8x16;
+}
+
+int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b);         // VZIP.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b)         // VZIP.16 q0,q0
+{
+    int16x8x2_t r16x8;
+    r16x8.val[0] =  _mm_unpacklo_epi16(a, b);
+    r16x8.val[1] =  _mm_unpackhi_epi16(a, b);
+    return r16x8;
+}
+
+int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b);         // VZIP.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b)         // VZIP.32 q0,q0
+{
+    int32x4x2_t r32x4;
+    r32x4.val[0] =  _mm_unpacklo_epi32(a, b);
+    r32x4.val[1] =  _mm_unpackhi_epi32(a, b);
+    return r32x4;
+}
+
+uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b);         // VZIP.8 q0,q0
+#define vzipq_u8 vzipq_s8
+
+uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b);         // VZIP.16 q0,q0
+#define vzipq_u16 vzipq_s16
+
+uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b);         // VZIP.32 q0,q0
+#define vzipq_u32 vzipq_s32
+
+float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b);         // VZIP.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b)         // VZIP.32 q0,q0
+{
+    float32x4x2_t f32x4;
+    f32x4.val[0] =   _mm_unpacklo_ps ( a,  b);
+    f32x4.val[1] =   _mm_unpackhi_ps ( a,  b);
+    return f32x4;
+}
+
+poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b);         // VZIP.8 q0,q0
+#define vzipq_p8 vzipq_u8
+
+poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b);         // VZIP.16 q0,q0
+#define vzipq_p16 vzipq_u16
+
+//*********************** De-Interleave elements *************************
+//*************************************************************************
+//As the result of these functions first val  contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...)
+//no such functions in IA32 SIMD, shuffle is required
+
+#if defined(USE_SSSE3)
+int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b);         // VUZP.8 q0,q0
+_NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b)         // VUZP.8 q0,q0
+{
+    int8x16x2_t v8x16;
+    __m128i a_sh, b_sh;
+    _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd);         //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd);         //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
+    //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
+    v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh);         ///a0, a2, a4, a6, a8, a10, a12, a14,  b0, b2, b4, b6, b8, b10, b12, b14,
+    v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh);         //a1, a3, a5, a7, a9, a11, a13, a15,  b1, b3, b5, b7, b9, b11, b13, b15
+    return v8x16;
+}
+#endif
+
+#if defined(USE_SSSE3)
+int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b);         // VUZP.16 q0,q0
+_NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b)         // VUZP.16 q0,q0
+{
+    int16x8x2_t v16x8;
+    __m128i a_sh, b_sh;
+    _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
+    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd);         //a0, a2, a4, a6,  a1, a3, a5, a7
+    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd);         //b0, b2, b4, b6,  b1, b3, b5, b7
+    v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh);         //a0, a2, a4, a6, b0, b2, b4, b6
+    v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh);         //a1, a3, a5, a7, b1, b3, b5, b7
+    return v16x8;
+}
+#endif
+
+int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b);         // VUZP.32 q0,q0
+_NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b)         // VUZP.32 q0,q0
+{         //may be not optimal solution compared with serial
+    int32x4x2_t v32x4;
+    __m128i a_sh, b_sh;
+    a_sh = _mm_shuffle_epi32 (a, 216);         //a0, a2, a1, a3
+    b_sh = _mm_shuffle_epi32 (b, 216);         //b0, b2, b1, b3
+
+    v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh);         //a0, a2, b0, b2
+    v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh);         //a1, a3, b1, b3
+    return v32x4;
+}
+
+#if defined(USE_SSSE3)
+uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b);         // VUZP.8 q0,q0
+#define vuzpq_u8 vuzpq_s8
+
+uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b);         // VUZP.16 q0,q0
+#define vuzpq_u16 vuzpq_s16
+#endif
+
+uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b);         // VUZP.32 q0,q0
+#define vuzpq_u32 vuzpq_s32
+
+float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b);         // VUZP.32 q0,q0
+_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b)         // VUZP.32 q0,q0
+{
+    float32x4x2_t v32x4;
+    v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0));         //a0, a2, b0, b2 , need to check endianess however
+    v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1));         //a1, a3, b1, b3, need to check endianess however
+    return v32x4;
+}
+
+#if defined(USE_SSSE3)
+poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b);         // VUZP.8 q0,q0
+#define vuzpq_p8 vuzpq_u8
+
+poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b);         // VUZP.16 q0,q0
+#define vuzpq_p16 vuzpq_u16
+#endif
+
+//##############################################################################################
+//*********************** Reinterpret cast intrinsics.******************************************
+//##############################################################################################
+// Not a part of oficial NEON instruction set but available in gcc compiler *********************
+
+poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
+#define vreinterpretq_p8_u32
+
+poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
+#define vreinterpretq_p8_u16
+
+poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
+#define vreinterpretq_p8_u8
+
+poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
+#define vreinterpretq_p8_s32
+
+poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
+#define vreinterpretq_p8_s16
+
+poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
+#define vreinterpretq_p8_s8
+
+poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
+#define vreinterpretq_p8_u64
+
+poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
+#define vreinterpretq_p8_s64
+
+poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
+#define vreinterpretq_p8_f32(t) _M128i(t)
+
+poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
+#define vreinterpretq_p8_p16
+
+poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
+#define vreinterpretq_p16_u32
+
+poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
+#define vreinterpretq_p16_u16
+
+poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
+#define vreinterpretq_p16_s32
+
+poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
+#define vreinterpretq_p16_s16
+
+poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
+#define vreinterpretq_p16_s8
+
+poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
+#define vreinterpretq_p16_u64
+
+poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
+#define vreinterpretq_p16_s64
+
+poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
+#define vreinterpretq_p16_f32(t) _M128i(t)
+
+poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
+#define vreinterpretq_p16_p8  vreinterpretq_s16_p8
+
+//****  Integer to float  ******
+
+float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
+#define  vreinterpretq_f32_u32(t) *(__m128*)&(t)
+
+float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
+#define vreinterpretq_f32_u16 vreinterpretq_f32_u32
+
+float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
+#define vreinterpretq_f32_u8 vreinterpretq_f32_u32
+
+float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
+#define vreinterpretq_f32_s32 vreinterpretq_f32_u32
+
+float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
+#define vreinterpretq_f32_s16 vreinterpretq_f32_u32
+
+float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
+#define vreinterpretq_f32_s8 vreinterpretq_f32_u32
+
+float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
+#define vreinterpretq_f32_u64 vreinterpretq_f32_u32
+
+float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
+#define vreinterpretq_f32_s64 vreinterpretq_f32_u32
+
+float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
+#define vreinterpretq_f32_p16 vreinterpretq_f32_u32
+
+float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
+#define vreinterpretq_f32_p8 vreinterpretq_f32_u32
+
+//*** Integer type conversions ******************
+//no conversion necessary for the following functions because it is same data type
+
+int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
+#define vreinterpretq_s64_u32
+
+int64x2_t vreinterpretq_s64_s16 (uint16x8_t t);
+#define vreinterpretq_s64_s16
+
+int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
+#define vreinterpretq_s64_u8
+
+int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
+#define vreinterpretq_s64_s32
+
+int64x2_t vreinterpretq_s64_u16 (int16x8_t t);
+#define vreinterpretq_s64_u16
+
+int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
+#define vreinterpretq_s64_s8
+
+int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
+#define vreinterpretq_s64_u64
+
+int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
+#define vreinterpretq_s64_f32(t) _M128i(t)
+
+int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
+#define vreinterpretq_s64_p16
+
+int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
+#define vreinterpretq_s64_p8
+
+uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
+#define vreinterpretq_u64_u32
+
+uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
+#define vreinterpretq_u64_u16
+
+uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
+#define vreinterpretq_u64_u8
+
+uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
+#define vreinterpretq_u64_s32
+
+uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
+#define vreinterpretq_u64_s16
+
+uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
+#define vreinterpretq_u64_s8
+
+uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
+#define vreinterpretq_u64_s64
+
+uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
+#define vreinterpretq_u64_f32(t) _M128i(t)
+
+uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
+#define vreinterpretq_u64_p16
+
+uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
+#define vreinterpretq_u64_p8
+
+int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
+#define vreinterpretq_s8_u32
+
+int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
+#define vreinterpretq_s8_u16
+
+int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
+#define vreinterpretq_s8_u8
+
+int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
+#define vreinterpretq_s8_s32
+
+int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
+#define vreinterpretq_s8_s16
+
+int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
+#define vreinterpretq_s8_u64
+
+int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
+#define vreinterpretq_s8_s64
+
+int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
+#define vreinterpretq_s8_f32(t) _M128i(t)
+
+int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
+#define vreinterpretq_s8_p16
+
+int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
+#define vreinterpretq_s8_p8
+
+int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
+#define vreinterpretq_s16_u32
+
+int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
+#define vreinterpretq_s16_u16
+
+int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
+#define vreinterpretq_s16_u8
+
+int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
+#define vreinterpretq_s16_s32
+
+int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
+#define vreinterpretq_s16_s8
+
+int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
+#define vreinterpretq_s16_u64
+
+int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
+#define vreinterpretq_s16_s64
+
+int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
+#define vreinterpretq_s16_f32(t) _M128i(t)
+
+int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
+#define vreinterpretq_s16_p16
+
+int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
+#define vreinterpretq_s16_p8
+
+int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
+#define vreinterpretq_s32_u32
+
+int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
+#define vreinterpretq_s32_u16
+
+int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
+#define vreinterpretq_s32_u8
+
+int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
+#define vreinterpretq_s32_s16
+
+int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
+#define vreinterpretq_s32_s8
+
+int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
+#define vreinterpretq_s32_u64
+
+int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
+#define vreinterpretq_s32_s64
+
+int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
+#define vreinterpretq_s32_f32(t)  _mm_castps_si128(t)         //(*(__m128i*)&(t))
+
+int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
+#define vreinterpretq_s32_p16
+
+int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
+#define vreinterpretq_s32_p8
+
+uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
+#define vreinterpretq_u8_u32
+
+uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
+#define vreinterpretq_u8_u16
+
+uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
+#define vreinterpretq_u8_s32
+
+uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
+#define vreinterpretq_u8_s16
+
+uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
+#define vreinterpretq_u8_s8
+
+uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
+#define vreinterpretq_u8_u64
+
+uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
+#define vreinterpretq_u8_s64
+
+uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
+#define vreinterpretq_u8_f32(t) _M128i(t)
+
+uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
+#define vreinterpretq_u8_p16
+
+uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
+#define vreinterpretq_u8_p8
+
+uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
+#define vreinterpretq_u16_u32
+
+uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
+#define vreinterpretq_u16_u8
+
+uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
+#define vreinterpretq_u16_s32
+
+uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
+#define vreinterpretq_u16_s16
+
+uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
+#define vreinterpretq_u16_s8
+
+uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
+#define vreinterpretq_u16_u64
+
+uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
+#define vreinterpretq_u16_s64
+
+uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
+#define vreinterpretq_u16_f32(t) _M128i(t)
+
+uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
+#define vreinterpretq_u16_p16
+
+uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
+#define vreinterpretq_u16_p8
+
+uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
+#define vreinterpretq_u32_u16
+
+uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
+#define vreinterpretq_u32_u8
+
+uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
+#define vreinterpretq_u32_s32
+
+uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
+#define vreinterpretq_u32_s16
+
+uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
+#define vreinterpretq_u32_s8
+
+uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
+#define vreinterpretq_u32_u64
+
+uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
+#define vreinterpretq_u32_s64
+
+uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
+#define  vreinterpretq_u32_f32(t) _M128i(t)
+
+uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
+#define vreinterpretq_u32_p16
+
+uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
+#define vreinterpretq_u32_p8
+
+#endif /* NEON2SSE_H */
-- 
cgit v1.2.3