;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp9_post_proc_down_and_across_xmm
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
sym(vp9_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ALIGN_STACK 16, rax
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movdqa      xmm0, [GLOBAL(rd42)]
    sub         rsp, 16
    movdqa      [rsp], xmm0
%define RD42 [rsp]
%else
%define RD42 [GLOBAL(rd42)]
%endif


        movd        xmm2,       dword ptr arg(6) ;flimit
        punpcklwd   xmm2,       xmm2
        punpckldq   xmm2,       xmm2
        punpcklqdq  xmm2,       xmm2

        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(1) ;dst_ptr

        movsxd      rcx,        DWORD PTR arg(4) ;rows
        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
        pxor        xmm0,       xmm0              ; mm0 = 00000000

.nextrow:

        xor         rdx,        rdx       ; clear out rdx for use as loop counter
.nextcol:
        movq        xmm3,       QWORD PTR [rsi]         ; mm4 = r0 p0..p7
        punpcklbw   xmm3,       xmm0                    ; mm3 = p0..p3
        movdqa      xmm1,       xmm3                    ; mm1 = p0..p3
        psllw       xmm3,       2                       ;

        movq        xmm5,       QWORD PTR [rsi + rax]   ; mm4 = r1 p0..p7
        punpcklbw   xmm5,       xmm0                    ; mm5 = r1 p0..p3
        paddusw     xmm3,       xmm5                    ; mm3 += mm6

        ; thresholding
        movdqa      xmm7,       xmm1                    ; mm7 = r0 p0..p3
        psubusw     xmm7,       xmm5                    ; mm7 = r0 p0..p3 - r1 p0..p3
        psubusw     xmm5,       xmm1                    ; mm5 = r1 p0..p3 - r0 p0..p3
        paddusw     xmm7,       xmm5                    ; mm7 = abs(r0 p0..p3 - r1 p0..p3)
        pcmpgtw     xmm7,       xmm2

        movq        xmm5,       QWORD PTR [rsi + 2*rax] ; mm4 = r2 p0..p7
        punpcklbw   xmm5,       xmm0                    ; mm5 = r2 p0..p3
        paddusw     xmm3,       xmm5                    ; mm3 += mm5

        ; thresholding
        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
        psubusw     xmm6,       xmm5                    ; mm6 = r0 p0..p3 - r2 p0..p3
        psubusw     xmm5,       xmm1                    ; mm5 = r2 p0..p3 - r2 p0..p3
        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r2 p0..p3)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        neg         rax
        movq        xmm5,       QWORD PTR [rsi+2*rax]   ; mm4 = r-2 p0..p7
        punpcklbw   xmm5,       xmm0                    ; mm5 = r-2 p0..p3
        paddusw     xmm3,       xmm5                    ; mm3 += mm5

        ; thresholding
        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
        psubusw     xmm6,       xmm5                    ; mm6 = p0..p3 - r-2 p0..p3
        psubusw     xmm5,       xmm1                    ; mm5 = r-2 p0..p3 - p0..p3
        paddusw     xmm6,       xmm5                    ; mm6 = abs(r0 p0..p3 - r-2 p0..p3)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds

        movq        xmm4,       QWORD PTR [rsi+rax]     ; mm4 = r-1 p0..p7
        punpcklbw   xmm4,       xmm0                    ; mm4 = r-1 p0..p3
        paddusw     xmm3,       xmm4                    ; mm3 += mm5

        ; thresholding
        movdqa      xmm6,       xmm1                    ; mm6 = r0 p0..p3
        psubusw     xmm6,       xmm4                    ; mm6 = p0..p3 - r-2 p0..p3
        psubusw     xmm4,       xmm1                    ; mm5 = r-1 p0..p3 - p0..p3
        paddusw     xmm6,       xmm4                    ; mm6 = abs(r0 p0..p3 - r-1 p0..p3)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6                    ; accumulate thresholds


        paddusw     xmm3,       RD42                    ; mm3 += round value
        psraw       xmm3,       3                       ; mm3 /= 8

        pand        xmm1,       xmm7                    ; mm1 select vals > thresh from source
        pandn       xmm7,       xmm3                    ; mm7 select vals < thresh from blurred result
        paddusw     xmm1,       xmm7                    ; combination

        packuswb    xmm1,       xmm0                    ; pack to bytes
        movq        QWORD PTR [rdi], xmm1             ;

        neg         rax                   ; pitch is positive
        add         rsi,        8
        add         rdi,        8

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols

        jl          .nextcol

        ; done with the all cols, start the across filtering in place
        sub         rsi,        rdx
        sub         rdi,        rdx

        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-8];

.acrossnextcol:
        movq        xmm7,       QWORD PTR [rdi +rdx -2]
        movd        xmm4,       DWORD PTR [rdi +rdx +6]

        pslldq      xmm4,       8
        por         xmm4,       xmm7

        movdqa      xmm3,       xmm4
        psrldq      xmm3,       2
        punpcklbw   xmm3,       xmm0              ; mm3 = p0..p3
        movdqa      xmm1,       xmm3              ; mm1 = p0..p3
        psllw       xmm3,       2


        movdqa      xmm5,       xmm4
        psrldq      xmm5,       3
        punpcklbw   xmm5,       xmm0              ; mm5 = p1..p4
        paddusw     xmm3,       xmm5              ; mm3 += mm6

        ; thresholding
        movdqa      xmm7,       xmm1              ; mm7 = p0..p3
        psubusw     xmm7,       xmm5              ; mm7 = p0..p3 - p1..p4
        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
        paddusw     xmm7,       xmm5              ; mm7 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm7,       xmm2

        movdqa      xmm5,       xmm4
        psrldq      xmm5,       4
        punpcklbw   xmm5,       xmm0              ; mm5 = p2..p5
        paddusw     xmm3,       xmm5              ; mm3 += mm5

        ; thresholding
        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p1..p4
        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds


        movdqa      xmm5,       xmm4              ; mm5 = p-2..p5
        punpcklbw   xmm5,       xmm0              ; mm5 = p-2..p1
        paddusw     xmm3,       xmm5              ; mm3 += mm5

        ; thresholding
        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
        psubusw     xmm6,       xmm5              ; mm6 = p0..p3 - p1..p4
        psubusw     xmm5,       xmm1              ; mm5 = p1..p4 - p0..p3
        paddusw     xmm6,       xmm5              ; mm6 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        psrldq      xmm4,       1                   ; mm4 = p-1..p5
        punpcklbw   xmm4,       xmm0              ; mm4 = p-1..p2
        paddusw     xmm3,       xmm4              ; mm3 += mm5

        ; thresholding
        movdqa      xmm6,       xmm1              ; mm6 = p0..p3
        psubusw     xmm6,       xmm4              ; mm6 = p0..p3 - p1..p4
        psubusw     xmm4,       xmm1              ; mm5 = p1..p4 - p0..p3
        paddusw     xmm6,       xmm4              ; mm6 = abs(p0..p3 - p1..p4)
        pcmpgtw     xmm6,       xmm2
        por         xmm7,       xmm6              ; accumulate thresholds

        paddusw     xmm3,       RD42              ; mm3 += round value
        psraw       xmm3,       3                 ; mm3 /= 8

        pand        xmm1,       xmm7              ; mm1 select vals > thresh from source
        pandn       xmm7,       xmm3              ; mm7 select vals < thresh from blurred result
        paddusw     xmm1,       xmm7              ; combination

        packuswb    xmm1,       xmm0              ; pack to bytes
        movq        QWORD PTR [rdi+rdx-8],  mm0   ; store previous four bytes
        movdq2q     mm0,        xmm1

        add         rdx,        8
        cmp         edx,        dword arg(5) ;cols
        jl          .acrossnextcol;

        ; last 8 pixels
        movq        QWORD PTR [rdi+rdx-8],  mm0

        ; done with this rwo
        add         rsi,rax               ; next line
        mov         eax, dword arg(3) ;dst_pixels_per_line ; destination pitch?
        add         rdi,rax               ; next destination
        mov         eax, dword arg(2) ;src_pixels_per_line ; destination pitch?

        dec         rcx                   ; decrement count
        jnz         .nextrow              ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add rsp,16
    pop rsp
%endif
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD42


;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_xmm) PRIVATE
sym(vp9_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit2 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp9_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
            mov         rsi,        arg(0) ; s
            pxor        xmm0,       xmm0        ;

            movsxd      rax,        dword ptr arg(1) ;pitch       ;
            neg         rax                                     ; rax = -pitch

            lea         rsi,        [rsi + rax*8];              ; rdi = s[-pitch*8]
            neg         rax


            pxor        xmm5,       xmm5
            pxor        xmm6,       xmm6        ;

            pxor        xmm7,       xmm7        ;
            mov         rdi,        rsi

            mov         rcx,        15          ;

.loop_initvar:
            movq        xmm1,       QWORD PTR [rdi];
            punpcklbw   xmm1,       xmm0        ;

            paddw       xmm5,       xmm1        ;
            pmullw      xmm1,       xmm1        ;

            movdqa      xmm2,       xmm1        ;
            punpcklwd   xmm1,       xmm0        ;

            punpckhwd   xmm2,       xmm0        ;
            paddd       xmm6,       xmm1        ;

            paddd       xmm7,       xmm2        ;
            lea         rdi,        [rdi+rax]   ;

            dec         rcx
            jne         .loop_initvar
            ;save the var and sum
            xor         rdx,        rdx
.loop_row:
            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            paddw       xmm5,       xmm2
            psubw       xmm5,       xmm1

            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2
            paddd       xmm7,       xmm4

            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6
            movdqa      xmm1,       xmm5

            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1

            pmulhw      xmm4,       xmm4
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4
            punpckhwd   xmm2,       xmm4

            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4

            psrad       xmm3,       31
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0

            movq        xmm1,       QWORD PTR [rsi+rax*8]

            movq        xmm2,       xmm1
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5
            mov         rcx,        rdx

            and         rcx,        127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp9_rv))]
            movdqu      xmm4,       [rax + rcx*2] ;vp9_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2] ;vp9_rv[rcx*2]
%else
            movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
%endif

            paddw       xmm1,       xmm4
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4

            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3

            pandn       xmm3,       xmm2
            por         xmm1,       xmm3

            and         rcx,        15
            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]

            mov         rcx,        rdx
            sub         rcx,        8

            and         rcx,        15
            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]

            movq        [rsi],      mm0
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row

        add         dword arg(0), 8 ; s += 8
        sub         dword arg(3), 8 ; cols -= 8
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp9_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

        xor         rdx,    rdx ;sumsq=0;
        xor         rcx,    rcx ;sum=0;
        mov         rsi,    arg(0); s
        mov         rdi,    -8
.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;    sumsq += s[i]*s[i];
        ;    sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx

            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8
            pxor        mm0,    mm0
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0
.nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
            paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2

            paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
            pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5

            paddd       xmm6,   xmm2
            paddd       xmm7,   xmm1

            pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
            pshufd      xmm7,   xmm7,   0               ; duplicate the last ones

            psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
            psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000

            pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
            pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
            pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
            pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7
            psubd       xmm5,   xmm3

            psubd       xmm5,   flimit4
            psrad       xmm5,   31

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0

            movd        xmm1,   DWORD PTR [rsi+rcx]
            movq        xmm2,   xmm1

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6
            paddd       xmm1,   [GLOBAL(four8s)]

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5

            pandn       xmm5,   xmm2
            por         xmm5,   xmm1

            movd        [rsi+rcx-8],  mm0
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            psrldq      xmm7,   12

            psrldq      xmm6,   12
            add         rcx,    4

            cmp         rcx,    rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd rax, dword arg(1)
        add    arg(0), rax

        sub dword arg(2), 1 ;rows-=1
        cmp dword arg(2), 0
        jg .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int width, unsigned int height, int pitch)
global sym(vp9_plane_add_noise_wmt) PRIVATE
sym(vp9_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call sym(LIBVPX_RAND) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax

.addnoise_nextset:
            movdqu      xmm1,[rsi+rax]         ; get the source

            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     xmm1, [rdx+32] ;bothclamp
            psubusb     xmm1, [rdx+16] ;whiteclamp

            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
            paddb       xmm1,xmm2              ; add it in
            movdqu      [rsi+rax],xmm1         ; store the result

            add         rax,16                 ; move to the next line

            cmp         rax, rcx
            jl          .addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
rd42:
    times 8 dw 0x04
four8s:
    times 4 dd 8