;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

%macro LF_ABS 2
        ; %1 value not preserved
        ; %2 value preserved
        ; output in %1
        movdqa      scratch1, %2            ; v2
        psubusb     scratch1, %1            ; v2 - v1
        psubusb     %1, %2                  ; v1 - v2
        por         %1, scratch1            ; abs(v2 - v1)
%endmacro

%macro LF_FILTER_HEV_MASK 8-9

        LF_ABS      %1, %2                  ; abs(p3 - p2)
        LF_ABS      %2, %3                  ; abs(p2 - p1)
        pmaxub      %1, %2                  ; accumulate mask
%if %0 == 8
        movdqa      scratch2, %3            ; save p1
        LF_ABS      scratch2, %4            ; abs(p1 - p0)
%endif
        LF_ABS      %4, %5                  ; abs(p0 - q0)
        LF_ABS      %5, %6                  ; abs(q0 - q1)
%if %0 == 8
        pmaxub      %5, scratch2            ; accumulate hev
%else
        pmaxub      %5, %9
%endif
        pmaxub      %1, %5                  ; accumulate mask

        LF_ABS      %3, %6                  ; abs(p1 - q1)
        LF_ABS      %6, %7                  ; abs(q1 - q2)
        pmaxub      %1, %6                  ; accumulate mask
        LF_ABS      %7, %8                  ; abs(q2 - q3)
        pmaxub      %1, %7                  ; accumulate mask

        paddusb     %4, %4                  ; 2 * abs(p0 - q0)
        pand        %3, [GLOBAL(tfe)]
        psrlw       %3, 1                   ; abs(p1 - q1) / 2
        paddusb     %4, %3                  ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2

        psubusb     %1, [limit]
        psubusb     %4, [blimit]
        por         %1, %4
        pcmpeqb     %1, zero                ; mask

        psubusb     %5, [thresh]
        pcmpeqb     %5, zero                ; ~hev
%endmacro

%macro LF_FILTER 6
        ; %1-%4: p1-q1
        ; %5: mask
        ; %6: hev

        movdqa      scratch2, %6            ; save hev

        pxor        %1, [GLOBAL(t80)]       ; ps1
        pxor        %4, [GLOBAL(t80)]       ; qs1
        movdqa      scratch1, %1
        psubsb      scratch1, %4            ; signed_char_clamp(ps1 - qs1)
        pandn       scratch2, scratch1      ; vp8_filter &= hev

        pxor        %2, [GLOBAL(t80)]       ; ps0
        pxor        %3, [GLOBAL(t80)]       ; qs0
        movdqa      scratch1, %3
        psubsb      scratch1, %2            ; qs0 - ps0
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        paddsb      scratch2, scratch1      ; vp8_filter += (qs0 - ps0)
        pand        %5, scratch2            ; &= mask

        movdqa      scratch2, %5
        paddsb      %5, [GLOBAL(t4)]        ; Filter1
        paddsb      scratch2, [GLOBAL(t3)]  ; Filter2

        ; Filter1 >> 3
        movdqa      scratch1, zero
        pcmpgtb     scratch1, %5
        psrlw       %5, 3
        pand        scratch1, [GLOBAL(te0)]
        pand        %5, [GLOBAL(t1f)]
        por         %5, scratch1

        psubsb      %3, %5                  ; qs0 - Filter1
        pxor        %3, [GLOBAL(t80)]

        ; Filter2 >> 3
        movdqa      scratch1, zero
        pcmpgtb     scratch1, scratch2
        psrlw       scratch2, 3
        pand        scratch1, [GLOBAL(te0)]
        pand        scratch2, [GLOBAL(t1f)]
        por         scratch2, scratch1

        paddsb      %2, scratch2            ; ps0 + Filter2
        pxor        %2, [GLOBAL(t80)]

        ; outer tap adjustments
        paddsb      %5, [GLOBAL(t1)]
        movdqa      scratch1, zero
        pcmpgtb     scratch1, %5
        psrlw       %5, 1
        pand        scratch1, [GLOBAL(t80)]
        pand        %5, [GLOBAL(t7f)]
        por         %5, scratch1
        pand        %5, %6                  ; vp8_filter &= ~hev

        psubsb      %4, %5                  ; qs1 - vp8_filter
        pxor        %4, [GLOBAL(t80)]

        paddsb      %1, %5                  ; ps1 + vp8_filter
        pxor        %1, [GLOBAL(t80)]
%endmacro

;void vp8_loop_filter_bh_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_bh_y_sse2) PRIVATE
sym(vp8_loop_filter_bh_y_sse2):

%ifidn __OUTPUT_FORMAT__,x64
  %define src      rcx ; src_ptr
  %define stride   rdx ; src_pixel_step
  %define blimit   r8
  %define limit    r9
  %define thresh   r10

  %define spp      rax
  %define stride3  r11
  %define stride5  r12
  %define stride7  r13

        push        rbp
        mov         rbp, rsp
        SAVE_XMM 11                         ; xmm6-xmm11 are callee-saved on Win64
        push        r12
        push        r13
        mov         thresh, arg(4)
%else
  %define src      rdi ; src_ptr
  %define stride   rsi ; src_pixel_step
  %define blimit   rdx
  %define limit    rcx
  %define thresh   r8

  %define spp      rax
  %define stride3  r9
  %define stride5  r10
  %define stride7  r11
%endif
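; i0-i15 below name the 16 rows of the current 16-byte-wide column of the
; Y plane: i0 = src + 0 * stride ... i15 = src + 15 * stride, expressed
; through src, spp (src + stride) and the stride3/5/7 helpers so each row
; is a single base + scaled-index address.  The three mask/filter passes
; that follow filter the internal horizontal block edges at rows 4, 8 and
; 12, each one using the eight rows around its edge as p3..q3.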
%define scratch1 xmm5
%define scratch2 xmm6
%define zero     xmm7

%define i0 [src]
%define i1 [spp]
%define i2 [src + 2 * stride]
%define i3 [spp + 2 * stride]
%define i4 [src + 4 * stride]
%define i5 [spp + 4 * stride]
%define i6 [src + 2 * stride3]
%define i7 [spp + 2 * stride3]
%define i8 [src + 8 * stride]
%define i9 [spp + 8 * stride]
%define i10 [src + 2 * stride5]
%define i11 [spp + 2 * stride5]
%define i12 [src + 4 * stride3]
%define i13 [spp + 4 * stride3]
%define i14 [src + 2 * stride7]
%define i15 [spp + 2 * stride7]

        ; prep work
        lea         spp, [src + stride]
        lea         stride3, [stride + 2 * stride]
        lea         stride5, [stride3 + 2 * stride]
        lea         stride7, [stride3 + 4 * stride]
        pxor        zero, zero

        ; load the first set into registers
        movdqa      xmm0, i0
        movdqa      xmm1, i1
        movdqa      xmm2, i2
        movdqa      xmm3, i3
        movdqa      xmm4, i4
        movdqa      xmm8, i5
        movdqa      xmm9, i6                ; q2, will contain abs(p1-p0)
        movdqa      xmm10, i7
        LF_FILTER_HEV_MASK xmm0, xmm1, xmm2, xmm3, xmm4, xmm8, xmm9, xmm10

        movdqa      xmm1, i2
        movdqa      xmm2, i3
        movdqa      xmm3, i4
        movdqa      xmm8, i5
        LF_FILTER xmm1, xmm2, xmm3, xmm8, xmm0, xmm4
        movdqa      i2, xmm1
        movdqa      i3, xmm2

        ; second set
        movdqa      i4, xmm3
        movdqa      i5, xmm8

        movdqa      xmm0, i6
        movdqa      xmm1, i7
        movdqa      xmm2, i8
        movdqa      xmm4, i9
        movdqa      xmm10, i10              ; q2, will contain abs(p1-p0)
        movdqa      xmm11, i11
        LF_FILTER_HEV_MASK xmm3, xmm8, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm9

        movdqa      xmm0, i6
        movdqa      xmm1, i7
        movdqa      xmm4, i8
        movdqa      xmm8, i9
        LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
        movdqa      i6, xmm0
        movdqa      i7, xmm1

        ; last set
        movdqa      i8, xmm4
        movdqa      i9, xmm8

        movdqa      xmm0, i10
        movdqa      xmm1, i11
        movdqa      xmm2, i12
        movdqa      xmm3, i13
        movdqa      xmm9, i14               ; q2, will contain abs(p1-p0)
        movdqa      xmm11, i15
        LF_FILTER_HEV_MASK xmm4, xmm8, xmm0, xmm1, xmm2, xmm3, xmm9, xmm11, xmm10

        movdqa      xmm0, i10
        movdqa      xmm1, i11
        movdqa      xmm3, i12
        movdqa      xmm8, i13
        LF_FILTER xmm0, xmm1, xmm3, xmm8, xmm4, xmm2
        movdqa      i10, xmm0
        movdqa      i11, xmm1
        movdqa      i12, xmm3
        movdqa      i13, xmm8

%ifidn __OUTPUT_FORMAT__,x64
        pop         r13
        pop         r12
        RESTORE_XMM                         ; matches SAVE_XMM 11 above
        pop         rbp
%endif
        ret


;void vp8_loop_filter_bv_y_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_bv_y_sse2) PRIVATE
sym(vp8_loop_filter_bv_y_sse2):

%ifidn __OUTPUT_FORMAT__,x64
  %define src      rcx ; src_ptr
  %define stride   rdx ; src_pixel_step
  %define blimit   r8
  %define limit    r9
  %define thresh   r10

  %define spp      rax
  %define stride3  r11
  %define stride5  r12
  %define stride7  r13

        push        rbp
        mov         rbp, rsp
        SAVE_XMM 15
        push        r12
        push        r13
        mov         thresh, arg(4)
%else
  %define src      rdi
  %define stride   rsi
  %define blimit   rdx
  %define limit    rcx
  %define thresh   r8

  %define spp      rax
  %define stride3  r9
  %define stride5  r10
  %define stride7  r11
%endif

%define scratch1 xmm5
%define scratch2 xmm6
%define zero     xmm7

%define s0 [src]
%define s1 [spp]
%define s2 [src + 2 * stride]
%define s3 [spp + 2 * stride]
%define s4 [src + 4 * stride]
%define s5 [spp + 4 * stride]
%define s6 [src + 2 * stride3]
%define s7 [spp + 2 * stride3]
%define s8 [src + 8 * stride]
%define s9 [spp + 8 * stride]
%define s10 [src + 2 * stride5]
%define s11 [spp + 2 * stride5]
%define s12 [src + 4 * stride3]
%define s13 [spp + 4 * stride3]
%define s14 [src + 2 * stride7]
%define s15 [spp + 2 * stride7]

%define i0 [rsp]
%define i1 [rsp + 16]
%define i2 [rsp + 32]
%define i3 [rsp + 48]
%define i4 [rsp + 64]
%define i5 [rsp + 80]
%define i6 [rsp + 96]
%define i7 [rsp + 112]
%define i8 [rsp + 128]
%define i9 [rsp + 144]
%define i10 [rsp + 160]
%define i11 [rsp + 176]
%define i12 [rsp + 192]
%define i13 [rsp + 208]
%define i14 [rsp + 224]
%define i15 [rsp + 240]
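; Vertical edges cannot be filtered directly with the row-oriented macros
; above, so the 16x16 block is transposed first: s0-s15 address the source
; rows in memory, while i0-i15 address a 256-byte scratch area on the stack
; that receives the transposed data.  The same three-edge filter is then run
; on the stack copy, and the result is transposed back and stored through
; s0-s15 at the end.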
        ALIGN_STACK 16, rax

        ; reserve stack space
        %define temp_storage 0 ; size is 256 (16*16)
        %define stack_size 256
        sub         rsp, stack_size

        ; prep work
        lea         spp, [src + stride]
        lea         stride3, [stride + 2 * stride]
        lea         stride5, [stride3 + 2 * stride]
        lea         stride7, [stride3 + 4 * stride]

        ; 8-f
        movdqa      xmm0, s8
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, s9                ; 80 90
        punpckhbw   xmm1, s9                ; 88 98

        movdqa      xmm2, s10
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, s11               ; a0 b0
        punpckhbw   xmm3, s11               ; a8 b8

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

        ; using xmm[0124]
        ; work on next 4 rows
        movdqa      xmm3, s12
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, s13               ; c0 d0
        punpckhbw   xmm5, s13               ; c8 d8

        movdqa      xmm6, s14
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, s15               ; e0 f0
        punpckhbw   xmm7, s15               ; e8 f8

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7              ; cc dc ec fc

        ; pull the third and fourth sets together
        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 15 registers ...
        movdqa      i0, xmm0
        movdqa      i1, xmm7
        movdqa      i2, xmm4
        movdqa      i3, xmm3
        movdqa      i4, xmm1
        movdqa      i5, xmm8
        movdqa      i6, xmm2
        movdqa      i7, xmm5

        ; 0-7
        movdqa      xmm0, s0
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, s1                ; 00 10
        punpckhbw   xmm1, s1                ; 08 18

        movdqa      xmm2, s2
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, s3                ; 20 30
        punpckhbw   xmm3, s3                ; 28 38

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 00 10 20 30
        punpckhwd   xmm4, xmm2              ; 04 14 24 34

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 08 18 28 38
        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

        ; using xmm[0124]
        ; work on next 4 rows
        movdqa      xmm3, s4
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, s5                ; 40 50
        punpckhbw   xmm5, s5                ; 48 58

        movdqa      xmm6, s6
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, s7                ; 60 70
        punpckhbw   xmm7, s7                ; 68 78

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; 40 50 60 70
        punpckhwd   xmm8, xmm6              ; 44 54 64 74

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; 48 58 68 78
        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

        ; pull the first two sets together
        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e

        ; final combination
        movdqa      xmm6, xmm0
        punpcklqdq  xmm0, i0
        punpckhqdq  xmm6, i0

        movdqa      xmm9, xmm7
        punpcklqdq  xmm7, i1
        punpckhqdq  xmm9, i1

        movdqa      xmm10, xmm4
        punpcklqdq  xmm4, i2
        punpckhqdq  xmm10, i2

        movdqa      xmm11, xmm3
        punpcklqdq  xmm3, i3
        punpckhqdq  xmm11, i3

        movdqa      xmm12, xmm1
        punpcklqdq  xmm1, i4
        punpckhqdq  xmm12, i4

        movdqa      xmm13, xmm8
        punpcklqdq  xmm8, i5
        punpckhqdq  xmm13, i5

        movdqa      xmm14, xmm2
        punpcklqdq  xmm2, i6
        punpckhqdq  xmm14, i6

        movdqa      xmm15, xmm5
        punpcklqdq  xmm5, i7
        punpckhqdq  xmm15, i7

        movdqa      i0, xmm0
        movdqa      i1, xmm6
        movdqa      i2, xmm7
        movdqa      i3, xmm9
        movdqa      i4, xmm4
        movdqa      i5, xmm10
        movdqa      i6, xmm3
        movdqa      i7, xmm11
        movdqa      i8, xmm1
        movdqa      i9, xmm12
        movdqa      i10, xmm8
        movdqa      i11, xmm13
        movdqa      i12, xmm2
        movdqa      i13, xmm14
        movdqa      i14, xmm5
        movdqa      i15, xmm15

; TRANSPOSED DATA AVAILABLE ON THE STACK
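        ; The first mask pass works straight from registers: xmm0, xmm6, xmm7,
        ; xmm9, xmm4, xmm10, xmm3 and xmm11 still hold transposed rows 0-7
        ; from the final combination above.  xmm6/xmm7 are copied to
        ; xmm12/xmm13 because they double as scratch2/zero inside the macros,
        ; and zero must be re-cleared since xmm7 was used as a transpose
        ; temporary.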
        movdqa      xmm12, xmm6
        movdqa      xmm13, xmm7

        pxor        zero, zero
        LF_FILTER_HEV_MASK xmm0, xmm12, xmm13, xmm9, xmm4, xmm10, xmm3, xmm11

        movdqa      xmm1, i2
        movdqa      xmm2, i3
        movdqa      xmm8, i4
        movdqa      xmm9, i5
        LF_FILTER xmm1, xmm2, xmm8, xmm9, xmm0, xmm4
        movdqa      i2, xmm1
        movdqa      i3, xmm2

        ; second set
        movdqa      i4, xmm8
        movdqa      i5, xmm9

        movdqa      xmm0, i6
        movdqa      xmm1, i7
        movdqa      xmm2, i8
        movdqa      xmm4, i9
        movdqa      xmm10, i10              ; q2, will contain abs(p1-p0)
        movdqa      xmm11, i11
        LF_FILTER_HEV_MASK xmm8, xmm9, xmm0, xmm1, xmm2, xmm4, xmm10, xmm11, xmm3

        movdqa      xmm0, i6
        movdqa      xmm1, i7
        movdqa      xmm3, i8
        movdqa      xmm4, i9
        LF_FILTER xmm0, xmm1, xmm3, xmm4, xmm8, xmm2
        movdqa      i6, xmm0
        movdqa      i7, xmm1

        ; last set
        movdqa      i8, xmm3
        movdqa      i9, xmm4

        movdqa      xmm0, i10
        movdqa      xmm1, i11
        movdqa      xmm2, i12
        movdqa      xmm8, i13
        movdqa      xmm9, i14               ; q2, will contain abs(p1-p0)
        movdqa      xmm11, i15
        LF_FILTER_HEV_MASK xmm3, xmm4, xmm0, xmm1, xmm2, xmm8, xmm9, xmm11, xmm10

        movdqa      xmm0, i10
        movdqa      xmm1, i11
        movdqa      xmm4, i12
        movdqa      xmm8, i13
        LF_FILTER xmm0, xmm1, xmm4, xmm8, xmm3, xmm2
        movdqa      i10, xmm0
        movdqa      i11, xmm1
        movdqa      i12, xmm4
        movdqa      i13, xmm8

; RESHUFFLE AND WRITE OUT
        ; 8-f
        movdqa      xmm0, i8
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, i9                ; 80 90
        punpckhbw   xmm1, i9                ; 88 98

        movdqa      xmm2, i10
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, i11               ; a0 b0
        punpckhbw   xmm3, i11               ; a8 b8

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 80 90 a0 b0
        punpckhwd   xmm4, xmm2              ; 84 94 a4 b4

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 88 98 a8 b8
        punpckhwd   xmm2, xmm3              ; 8c 9c ac bc

        ; using xmm[0124]
        ; work on next 4 rows
        movdqa      xmm3, i12
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, i13               ; c0 d0
        punpckhbw   xmm5, i13               ; c8 d8

        movdqa      xmm6, i14
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, i15               ; e0 f0
        punpckhbw   xmm7, i15               ; e8 f8

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; c0 d0 e0 f0
        punpckhwd   xmm8, xmm6              ; c4 d4 e4 f4

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; c8 d8 e8 f8
        punpckhwd   xmm6, xmm7              ; cc dc ec fc

        ; pull the third and fourth sets together
        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 80 90 a0 b0 c0 d0 e0 f0
        punpckhdq   xmm7, xmm3              ; 82 92 a2 b2 c2 d2 e2 f2

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 84 94 a4 b4 c4 d4 e4 f4
        punpckhdq   xmm3, xmm8              ; 86 96 a6 b6 c6 d6 e6 f6

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 88 98 a8 b8 c8 d8 e8 f8
        punpckhdq   xmm8, xmm5              ; 8a 9a aa ba ca da ea fa

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 8c 9c ac bc cc dc ec fc
        punpckhdq   xmm5, xmm6              ; 8e 9e ae be ce de ee fe

        ; save the calculations. we only have 15 registers ...
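        ; the filtered contents of i8-i15 have been fully consumed by the
        ; loads above, so those stack slots are reused to park the
        ; half-transposed 8-f columns until they are merged with the 0-7
        ; half (punpcklqdq/punpckhqdq against i8-i15) and written back to
        ; s0-s15 below.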
        movdqa      i8, xmm0
        movdqa      i9, xmm7
        movdqa      i10, xmm4
        movdqa      i11, xmm3
        movdqa      i12, xmm1
        movdqa      i13, xmm8
        movdqa      i14, xmm2
        movdqa      i15, xmm5

        ; 0-7
        movdqa      xmm0, i0
        movdqa      xmm1, xmm0
        punpcklbw   xmm0, i1                ; 00 10
        punpckhbw   xmm1, i1                ; 08 18

        movdqa      xmm2, i2
        movdqa      xmm3, xmm2
        punpcklbw   xmm2, i3                ; 20 30
        punpckhbw   xmm3, i3                ; 28 38

        movdqa      xmm4, xmm0
        punpcklwd   xmm0, xmm2              ; 00 10 20 30
        punpckhwd   xmm4, xmm2              ; 04 14 24 34

        movdqa      xmm2, xmm1
        punpcklwd   xmm1, xmm3              ; 08 18 28 38
        punpckhwd   xmm2, xmm3              ; 0c 1c 2c 3c

        ; using xmm[0124]
        ; work on next 4 rows
        movdqa      xmm3, i4
        movdqa      xmm5, xmm3
        punpcklbw   xmm3, i5                ; 40 50
        punpckhbw   xmm5, i5                ; 48 58

        movdqa      xmm6, i6
        movdqa      xmm7, xmm6
        punpcklbw   xmm6, i7                ; 60 70
        punpckhbw   xmm7, i7                ; 68 78

        movdqa      xmm8, xmm3
        punpcklwd   xmm3, xmm6              ; 40 50 60 70
        punpckhwd   xmm8, xmm6              ; 44 54 64 74

        movdqa      xmm6, xmm5
        punpcklwd   xmm5, xmm7              ; 48 58 68 78
        punpckhwd   xmm6, xmm7              ; 4c 5c 6c 7c

        ; pull the first two sets together
        movdqa      xmm7, xmm0
        punpckldq   xmm0, xmm3              ; 00 10 20 30 40 50 60 70
        punpckhdq   xmm7, xmm3              ; 02 12 22 32 42 52 62 72

        movdqa      xmm3, xmm4
        punpckldq   xmm4, xmm8              ; 04 14 24 34 44 54 64 74
        punpckhdq   xmm3, xmm8              ; 06 16 26 36 46 56 66 76

        movdqa      xmm8, xmm1
        punpckldq   xmm1, xmm5              ; 08 18 28 38 48 58 68 78
        punpckhdq   xmm8, xmm5              ; 0a 1a 2a 3a 4a 5a 6a 7a

        movdqa      xmm5, xmm2
        punpckldq   xmm2, xmm6              ; 0c 1c 2c 3c 4c 5c 6c 7c
        punpckhdq   xmm5, xmm6              ; 0e 1e 2e 3e 4e 5e 6e 7e

        ; final combination
        movdqa      xmm6, xmm0
        punpcklqdq  xmm0, i8
        punpckhqdq  xmm6, i8

        movdqa      xmm9, xmm7
        punpcklqdq  xmm7, i9
        punpckhqdq  xmm9, i9

        movdqa      xmm10, xmm4
        punpcklqdq  xmm4, i10
        punpckhqdq  xmm10, i10

        movdqa      xmm11, xmm3
        punpcklqdq  xmm3, i11
        punpckhqdq  xmm11, i11

        movdqa      xmm12, xmm1
        punpcklqdq  xmm1, i12
        punpckhqdq  xmm12, i12

        movdqa      xmm13, xmm8
        punpcklqdq  xmm8, i13
        punpckhqdq  xmm13, i13

        movdqa      xmm14, xmm2
        punpcklqdq  xmm2, i14
        punpckhqdq  xmm14, i14

        movdqa      xmm15, xmm5
        punpcklqdq  xmm5, i15
        punpckhqdq  xmm15, i15

        movdqa      s0, xmm0
        movdqa      s1, xmm6
        movdqa      s2, xmm7
        movdqa      s3, xmm9
        movdqa      s4, xmm4
        movdqa      s5, xmm10
        movdqa      s6, xmm3
        movdqa      s7, xmm11
        movdqa      s8, xmm1
        movdqa      s9, xmm12
        movdqa      s10, xmm8
        movdqa      s11, xmm13
        movdqa      s12, xmm2
        movdqa      s13, xmm14
        movdqa      s14, xmm5
        movdqa      s15, xmm15

        ; free stack space
        add         rsp, stack_size

        ; un-ALIGN_STACK
        pop         rsp

%ifidn __OUTPUT_FORMAT__,x64
        pop         r13
        pop         r12
        RESTORE_XMM
        pop         rbp
%endif
        ret

SECTION_RODATA
align 16
te0:  times 16 db 0xe0
align 16
t7f:  times 16 db 0x7f
align 16
tfe:  times 16 db 0xfe
align 16
t1f:  times 16 db 0x1f
align 16
t80:  times 16 db 0x80
align 16
t1:   times 16 db 0x01
align 16
t3:   times 16 db 0x03
align 16
t4:   times 16 db 0x04