1 files changed, 0 insertions, 604 deletions
diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s
deleted file mode 100755
index 08821f5..0000000
--- a/common/arm/ih264_resi_trans_a9.s
+++ /dev/null
@@ -1,604 +0,0 @@
-@/******************************************************************************
-@ *
-@ * Copyright (C) 2015 The Android Open Source Project
-@ *
-@ * Licensed under the Apache License, Version 2.0 (the "License");
-@ * you may not use this file except in compliance with the License.
-@ * You may obtain a copy of the License at:
-@ *
-@ * http://www.apache.org/licenses/LICENSE-2.0
-@ *
-@ * Unless required by applicable law or agreed to in writing, software
-@ * distributed under the License is distributed on an "AS IS" BASIS,
-@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ * See the License for the specific language governing permissions and
-@ * limitations under the License.
-@ *
-@ *****************************************************************************
-@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
-@*/
-@/**
-@*******************************************************************************
-@* @file
-@*  ih264_resi_trans_a9.s
-@*
-@* @brief
-@*  Contains function definitions for residual and forward trans
-@*
-@* @author
-@*  Ittiam
-@*
-@* @par List of Functions:
-@*  ih264_resi_trans_4x4_a9
-@*  ih264_resi_trans_8x8_a9
-@* @remarks
-@*  None
-@*
-@*******************************************************************************
-
-
-.text
-.p2align 2
-@*****************************************************************************
-@*
-@* Function Name     : ih264_resi_trans_4x4_a9
-@* Description       : This function does cf4 of H264 followed by and approximate scaling
-@*
-@* Arguments         :
-@                       R0 :pointer to src buffer
-@                       R1 :pointer to pred buffer
-@                       R2 :pointer to dst buffer
-@                       R3 :src_stride
-@                       STACk :pred_stride,dst_stride
-
-@* Values Returned   : NONE
-@*
-@* Register Usage    :
-@* Stack Usage       :
-@* Cycles            : Around
-@* Interruptiaility  : Interruptable
-@*
-@* Known Limitations
-@*   \Assumptions    :
-@*
-@* Revision History  :
-@*         DD MM YYYY    Author(s)   Changes
-@*         30 12 2009    100633      First version
-@*
-@*****************************************************************************
-
-
-    .global ih264_resi_trans_4x4_a9
-    .extern g_scal_coff_h264_4x4
-g_scal_coff_h264_4x4_addr:
-    .long g_scal_coff_h264_4x4 - 4x4lbl - 8
-
-ih264_resi_trans_4x4_a9:
-
-    @R0 :pointer to src buffer
-    @R1 :pointer to pred buffer
-    @R2 :pointer to dst buffer
-    @R3 :src_stride
-    @STACk :pred_stride,dst_stride
-
-    push          {r4-r12, lr}          @push all the variables first
-
-    mov           r6, sp
-    add           r6, r6, #40           @decrement stack pointer,to accomodate two variables
-    ldmfd         r6, {r4-r5}           @load the strides into registers
-                                        @R4 pred_stride
-                                        @R5 dst_stride
-
-
-    @we have to give the stride as post inrement in VLDR1
-    @but since thr stride is from end of row 1 to start of row 2,
-    @we need to add the size of the curent row to strides ie we need to add 4 to it (4 bytes)
-    @ADD R3,#4
-    @ADD R4,#4
-    @ADD R5,#4
-    @in case of dst the stride represnts 16 bit ie 2*8bits
-    @hence we need to add #4 to it and thenm multiply by 2
-    @--------------------function loading done------------------------
-
-    @lets find residual
-    @data is like 1a -> d0[1:31]  d0[32:64]
-    @                    a b c d   # # # #
-    vld1.u8       d30, [r0], r3         @load 4 pixels of row1 current buffer
-    vld1.u8       d31, [r1], r4         @load 4 pixels of row1 pred buffer
-    @ data is like 1a -> q4[1:63]  q4[64:148]
-    @                    d8[1:63]  d9[1:63]
-    @                    a b c d   # # # #
-
-    vld1.u8       d28, [r0], r3         @load row 2 of src to d28[0]
-    vld1.u8       d29, [r1], r4         @load row2 of pred to d29[0]
-
-    vld1.u8       d26, [r0], r3         @load row 3 of src to d26[0]
-    vsubl.u8      q0, d30, d31          @curr - pred for row one
-
-    vld1.u8       d27, [r1], r4         @load row 3of pred t0 d27[0]
-    vsubl.u8      q1, d28, d29          @find row 2 of src -pred to d0
-
-    vld1.u8       d24, [r0], r3         @load row 4 of src to d24[0]
-
-    vld1.u8       d25, [r1], r4         @load row 4 of src tp d25[0]
-    vsubl.u8      q2, d26, d27          @load src-pred row 3 to d[2]
-
-    lsl           r5, r5, #2            @ multiply dst stride by since we are storing 32 bit values
-    ldr           r6, g_scal_coff_h264_4x4_addr
-4x4lbl:
-    add           r6, r6, pc            @  load the address of global array
-
-    vsubl.u8      q3, d24, d25          @load row 4 of src - pred to q6
-
-    @after this
-    @D0  -> 1a
-    @D2 -> 2a
-    @D4 -> 3a
-    @D6 -> 4a
-
-    @transpose the matrix so that we can do the horizontal transform first
-    @#1 #2  #3  #4
-    @a  b   c   d       ---- D0
-    @e  f   g   h       -----D2
-    @i  j   k   l       -----D4
-    @m  n   o   p       -----D6
-    @transpose the inner 2x2 blocks
-    vtrn.16       d0, d2
-    vld1.s16      {q10}, [r6]!          @   load the scaling values 0-7;
-    vtrn.16       d4, d6
-    @a  e   c   g
-    @b  f   d   h
-    @i  m   k   o
-    @j  n   l   p
-    vtrn.32       d0, d4
-    vtrn.32       d2, d6
-    @a  e   i   m  #1  -- D0 --- x4
-    @b  f   j   n  #2  -- D2 --- x5
-    @c  g   k   o  #3  -- D4 ----x6
-    @d  h   l   p  #4  -- D6 ----x7
-
-    @we have loaded the residuals into the registers , now we need to add and subtract them
-    @let us do the horiz transform first
-
-    vsub.s16      d5, d2, d4            @x2 = x5-x6
-    vsub.s16      d7, d0, d6            @x3 = x4-x7;
-
-    vadd.s16      d3, d2, d4            @x1 = x5+x6
-    vadd.s16      d1, d0, d6            @x0 = x4+x7
-
-
-    vshl.s16      d31, d7, #1           @
-    vshl.s16      d30, d5, #1           @
-
-    vadd.s16      d0, d1, d3            @x0 + x1;
-    vsub.s16      d4, d1, d3            @x0 - x1;
-
-    vadd.s16      d2, d31, d5           @U_SHIFT(x3,1,shft) + x2;
-    vsub.s16      d6, d7, d30           @x3 - U_SHIFT(x2,1,shft);
-
-    @taking transform again so as to make do vert transform
-    vtrn.16       d0, d2
-    vtrn.16       d4, d6
-
-    vtrn.32       d0, d4
-    vtrn.32       d2, d6
-
-    @let us do vertical transform
-    @same code as horiz
-
-    vadd.s16      d1, d0, d6            @x0 = x4+x7
-    vadd.s16      d3, d2, d4            @x1 = x5+x6
-    vsub.s16      d7, d0, d6            @x3 = x4-x7;
-    vsub.s16      d5, d2, d4            @x2 = x5-x6
-
-
-@Since we are going to do scal / quant or whatever, we are going to divide by
-@a 32 bit number. So we have to expand the values
-
-    @VADDL.S16 Q12,D1,D3;x0 + x1
-    @VSUBL.S16 Q14,D1,D3;x0 - x1
-
-    @VSHL.S16  D8,D5,#1;
-    @VSHL.S16  D9,D7,#1;
-
-    @VADDL.S16 Q13,D9,D5 ; + x2
-    @VSUBL.S16 Q15,D7,D8 ;x3 - U_SHIFT(x2,1,shft)
-
-@scaling follows
-
-@now we need to do the scaling,so load the scaling matrix
-@mutliplying by the scaling coeffient; store the results from q5-q8 ;
-
-    vadd.s16      d24, d3, d1           @x4 = x0 + x1
-    vsub.s16      d28, d1, d3           @x6 = x0 - x1
-
-    vshl.s16      d0, d7, #1            @ U_SHIFT(x3,1,shft)
-    vmull.s16     q4, d24, d20          @x4*s0
-
-    vshl.s16      d2, d5, #1            @ U_SHIFT(x2,1,shft)
-
-    vadd.s16      d26, d0, d5           @x5 = U_SHIFT(x3,1,shft) + x2
-    vmull.s16     q5, d26, d21          @x5*s1
-
-    vst1.s32      {q4}, [r2], r5        @save 4 pixels of row1 current buffer and increment pointer by stride
-
-    vld1.s16      {q10}, [r6]           @load 8-16 scaling coeffcients
-
-    vsub.s16      d30, d7, d2           @x7 = x3 - U_SHIFT(x2,1,shft)
-
-    vmull.s16     q6, d28, d20          @x6*s2
-    vst1.s32      {q5}, [r2], r5
-
-    vmull.s16     q7, d30, d21          @x7*s3
-
-
-    vst1.s32      {q6}, [r2], r5
-    vst1.s32      {q7}, [r2]
-
-    pop           {r4-r12, pc}          @pop back all variables
-
-
-
-
-@*****************************************************************************
-@* Function Name     : ih264_resi_trans_8x8_a9
-@* Description       : This function does cf8 followd by an approximate normalization of H264
-@*
-@* Arguments         :
-@*                      R0 :pointer to src buffer
-@                       R1 :pointer to pred buffer
-@                       R2 :pointer to dst buffer
-@                       R3 :src_stride
-@                       STACk :pred_stride,dst_st
-@*
-@*
-@* Values Returned   : NONE
-@*
-@* Register Usage    :
-@* Stack Usage       :
-@* Cycles            : Around
-@* Interruptiaility  : Interruptable
-@*
-@* Known Limitations
-@*   \Assumptions    :
-@*
-@* Revision History  :
-@*         DD MM YYYY    Author(s)   Changes
-@*         30 12 2009    100633      First version
-@*
-@*****************************************************************************
-
-
-    .global ih264_resi_trans_8x8_a9
-    .extern g_scal_coff_h264_8x8
-g_scal_coff_h264_8x8_addr:
-    .long g_scal_coff_h264_8x8 - 8x8lbl - 8
-
-
-ih264_resi_trans_8x8_a9:
-
-    @R0 :pointer to src buffer
-    @R1 :pointer to pred buffer
-    @R2 :pointer to dst buffer
-    @R3 :src_stride
-    @STACk :pred_stride,dst_stride
-
-    push          {r4-r12, lr}          @push all the variables first
-
-    mov           r6, sp
-    add           r6, r6, #40           @decrement stack pointer,to accomodate two variables
-    ldmfd         r6, {r4-r5}           @load the strides into registers
-                                        @R4 pred_stride
-                                        @R5 dst_stride
-
-    @we have to give the stride as post inrement in vst1
-    @in case of dst the stride represnts 16 bit ie 2*8bits
-    @hence we need to add #4 to it and thenm multiply by 2
-    @--------------------function loading done------------------------
-
-    @lets find residual
-    @data is like 1a -> d0[1:31]  d0[32:64]
-    @                    a b c d   # # # #
-    vld1.u8       d30, [r0], r3         @load 4 pixels of row1 current buffer
-    vld1.u8       d31, [r1], r4         @load 4 pixels of row1 pred buffer
-
-    vld1.u8       d28, [r0], r3         @src  rw2
-    vld1.u8       d29, [r1], r4         @pred rw2
-    vsubl.u8      q0, d30, d31          @src-pred rw1
-
-    vld1.u8       d26, [r0], r3
-    vld1.u8       d27, [r1], r4
-    vsubl.u8      q1, d28, d29
-
-    vld1.u8       d24, [r0], r3
-    vld1.u8       d25, [r1], r4
-    vsubl.u8      q2, d26, d27
-
-    vld1.u8       d22, [r0], r3
-    vld1.u8       d23, [r1], r4
-    vsubl.u8      q3, d24, d25
-
-    vld1.u8       d20, [r0], r3
-    vld1.u8       d21, [r1], r4
-    vsubl.u8      q4, d22, d23
-
-    vld1.u8       d18, [r0], r3
-    vld1.u8       d19, [r1], r4
-    vsubl.u8      q5, d20, d21
-
-    vld1.u8       d16, [r0], r3
-    vld1.u8       d17, [r1], r4
-    vsubl.u8      q6, d18, d19
-
-    lsl           r5, r5, #2
-
-
-    vsubl.u8      q7, d16, d17
-
-    @after this
-    @Q0 -> 1a
-    @Q1 -> 2a
-    @Q2 -> 3a
-    @Q3 -> 4a
-    @Q4 -> 5a
-    @Q5 -> 6a
-    @Q6 -> 7a
-    @Q7 -> 8a
-
-    @transpose the matrix so that we can do the horizontal transform first
-
-    @transpose the inner 2x2 blocks
-    vtrn.16       q0, q1
-    vtrn.16       q2, q3
-    vtrn.16       q4, q5
-    vtrn.16       q6, q7
-
-    @transpose the inner 4x4 blocks
-    vtrn.32       q0, q2
-    vtrn.32       q1, q3
-
-    vtrn.32       q4, q6
-    vtrn.32       q5, q7
-
-    @transpose the outer 8x8 blocks
-    vswp          d1, d8
-    vswp          d7, d14
-    vswp          d3, d10
-    vswp          d5, d12
-    @transpose done
-
-@@this point we will have data in Q0-Q7
-@Q7 will be populated within 2 clock cycle
-@all others are availabe @ this clock cycle
-
-    @we have loaded the residuals into the registers , now we need to add and subtract them
-    @let us do the horiz transform first
-
-    vadd.s16      q8, q0, q7            @      a0 = r0 + r7;
-    vadd.s16      q9, q1, q6            @      a1 = r1 + r6;
-    vadd.s16      q10, q2, q5           @     a2 = r2 + r5;
-    vadd.s16      q11, q3, q4           @     a3 = r3 + r4;
-
-    vsub.s16      q12, q0, q7           @     b0 = r0 - r7;
-    vsub.s16      q13, q1, q6           @     b1 = r1 - r6;
-    vsub.s16      q15, q3, q4           @     b3 = r3 - r4;
-    vsub.s16      q14, q2, q5           @     b2 = r2 - r5;
-
-    vadd.s16      q1, q8, q11           @     a4 = a0 + a3;
-    vadd.s16      q3, q9, q10           @     a5 = a1 + a2;
-    vsub.s16      q7, q9, q10           @     a7 = a1 - a2;
-    vsub.s16      q5, q8, q11           @     a6 = a0 - a3;
-
-    ldr           r6, g_scal_coff_h264_8x8_addr
-8x8lbl:
-    add           r6, r6, pc            @  load the address of global array
-
-    vadd.s16      q0, q1, q3            @      pi2_res[0] = a4 + a5;
-    vshr.s16      q8, q7, #1            @      pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
-
-    vsub.s16      q4, q1, q3            @      pi2_res[4] = a4 - a5;
-
-    vadd.s16      q2, q5, q8            @
-
-
-    vshr.s16      q9, q5, #1            @      pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
-    vsub.s16      q6, q9, q7            @
-
-@do not change Q0,Q2.Q4,Q6 they contain results
-@Q1,Q3,Q5,Q7 TO STORE RESULTS
-@Q8 Q9 Q10 Q11 USE @WILL
-
-    vshr.s16      q1, q12, #1           @     D_SHIFT(b0,1,shft)
-    vshr.s16      q3, q13, #1           @     D_SHIFT(b1,1,shft)
-    vshr.s16      q5, q14, #1           @     D_SHIFT(b2,1,shft)
-    vshr.s16      q7, q15, #1           @     D_SHIFT(b3,1,shft)
-
-    vadd.s16      q8, q1, q12           @     (D_SHIFT(b0,1,shft) + b0);
-    vadd.s16      q9, q3, q13           @     (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q10, q5, q14          @    (D_SHIFT(b2,1,shft) + b2);
-    vadd.s16      q11, q7, q15          @    (D_SHIFT(b3,1,shft) + b3);
-
-    vadd.s16      q1, q14, q8           @     b2 + (D_SHIFT(b0,1,shft) + b0);
-    vsub.s16      q5, q15, q9           @     b3 - (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q3, q15, q10          @    b3 + (D_SHIFT(b2,1,shft) + b2);
-    vsub.s16      q7, q11, q14          @    -b2 + (D_SHIFT(b3,1,shft) + b3);
-
-    vadd.s16      q8, q13, q1           @     b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
-    vsub.s16      q9, q12, q3           @     b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
-    vadd.s16      q10, q12, q5          @    b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q11, q13, q7          @    b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
-
-    vshr.s16      q15, q8, #2           @     D_SHIFT(b4,2,shft)
-    vshr.s16      q14, q9, #2           @     D_SHIFT(b5,2,shft);
-    vshr.s16      q13, q10, #2          @    D_SHIFT(b6,2,shft);
-    vshr.s16      q12, q11, #2          @    D_SHIFT(b7,2,shft);
-
-
-    vadd.s16      q3, q9, q13           @     pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
-    vsub.s16      q5, q10, q14          @    pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
-    vadd.s16      q1, q8, q12           @     pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
-    vsub.s16      q7, q15, q11          @    pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
-
-    @------------horiz transform done-------------------------
-    @results are in Q0-Q7
-    @all other neon registes can be used at will
-
-@doing vertical transform
-@code exact copy of horiz transform above
-
-    @transpose the inner 2x2 blocks
-    vtrn.16       q0, q1
-    vtrn.16       q2, q3
-    vtrn.16       q4, q5
-    vtrn.16       q6, q7
-
-    @transpose the inner 4x4 blocks
-    vtrn.32       q0, q2
-    vtrn.32       q1, q3
-
-    vtrn.32       q4, q6
-    vtrn.32       q5, q7
-
-    @transpose the outer 8x8 blocks
-    vswp          d1, d8
-    vswp          d3, d10
-    vswp          d5, d12
-    vswp          d7, d14
-
-    @transpose done
-
-    vadd.s16      q8, q0, q7            @      a0 = r0 + r7;
-    vadd.s16      q9, q1, q6            @      a1 = r1 + r6;
-    vadd.s16      q10, q2, q5           @     a2 = r2 + r5;
-    vadd.s16      q11, q3, q4           @     a3 = r3 + r4;
-
-    vsub.s16      q12, q0, q7           @     b0 = r0 - r7;
-    vsub.s16      q13, q1, q6           @     b1 = r1 - r6;
-    vsub.s16      q14, q2, q5           @     b2 = r2 - r5;
-    vsub.s16      q15, q3, q4           @     b3 = r3 - r4;
-
-    vadd.s16      q1, q8, q11           @     a4 = a0 + a3;
-    vadd.s16      q3, q9, q10           @     a5 = a1 + a2;
-    vsub.s16      q5, q8, q11           @     a6 = a0 - a3;
-    vsub.s16      q7, q9, q10           @     a7 = a1 - a2;
-
-
-    vadd.s16      q0, q1, q3            @      pi2_res[0] = a4 + a5;
-
-    vshr.s16      q8, q7, #1            @      pi2_res[2] = a6 + D_SHIFT(a7,1,shft);
-    @DSHIFT_TO_0 Q8,Q7,#1,#0
-    vadd.s16      q2, q5, q8            @
-
-    vsub.s16      q4, q1, q3            @      pi2_res[4] = a4 - a5;
-
-    vshr.s16      q9, q5, #1            @      pi2_res[6] = D_SHIFT(a6,1,shft) - a7;
-    vsub.s16      q6, q9, q7            @
-
-@do not change Q0,Q2.Q4,Q6 they contain results
-@Q1,Q3,Q5,Q7 TO STORE RESULTS
-@Q8 Q9 Q10 Q11 USE @WILL
-
-    vshr.s16      q1, q12, #1           @     D_SHIFT(b0,1,shft)
-    vshr.s16      q3, q13, #1           @     D_SHIFT(b1,1,shft)
-    vshr.s16      q5, q14, #1           @     D_SHIFT(b2,1,shft)
-    vshr.s16      q7, q15, #1           @     D_SHIFT(b3,1,shft)
-
-
-    vadd.s16      q8, q1, q12           @     (D_SHIFT(b0,1,shft) + b0);
-    vadd.s16      q9, q3, q13           @     (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q10, q5, q14          @    (D_SHIFT(b2,1,shft) + b2);
-    vadd.s16      q11, q7, q15          @    (D_SHIFT(b3,1,shft) + b3);
-
-    vadd.s16      q1, q14, q8           @     b2 + (D_SHIFT(b0,1,shft) + b0);
-    vadd.s16      q3, q15, q10          @    b3 + (D_SHIFT(b2,1,shft) + b2);
-    vsub.s16      q5, q15, q9           @     b3 - (D_SHIFT(b1,1,shft) + b1);
-    vsub.s16      q7, q11, q14          @    -b2 + (D_SHIFT(b3,1,shft) + b3);
-
-    vadd.s16      q8, q13, q1           @     b4 = b1 + b2 + (D_SHIFT(b0,1,shft) + b0);
-    vsub.s16      q9, q12, q3           @     b5 = b0 - b3 - (D_SHIFT(b2,1,shft) + b2);
-    vadd.s16      q10, q12, q5          @    b6 = b0 + b3 - (D_SHIFT(b1,1,shft) + b1);
-    vadd.s16      q11, q13, q7          @    b7 = b1 - b2 + (D_SHIFT(b3,1,shft) + b3);
-
-    vshr.s16      q15, q8, #2           @     D_SHIFT(b4,2,shft)
-    vshr.s16      q14, q9, #2           @     D_SHIFT(b5,2,shft);
-    vshr.s16      q13, q10, #2          @    D_SHIFT(b6,2,shft);
-    vshr.s16      q12, q11, #2          @    D_SHIFT(b7,2,shft);
-
-
-@since we are going to scal by small values, we need not expand the guys to 32 bit bit values
-    vsub.s16      q5, q10, q14          @    pi2_res[5] = b6 - D_SHIFT(b5,2,shft);
-    vsub.s16      q7, q15, q11          @    pi2_res[7] = D_SHIFT(b4,2,shft) - b7;
-    vadd.s16      q3, q9, q13           @     pi2_res[3] = b5 + D_SHIFT(b6,2,shft);
-    vadd.s16      q1, q8, q12           @     pi2_res[1] = b4 + D_SHIFT(b7,2,shft);
-
-    @------------vert transform done-------------------------
-    @results are in Q0-Q7
-    @all other neon registes can be used at will
-
-    @scaling
-    @since the 8x8 scaling matrix repeats in 1x4,1x4 block ,
-    @we need only load 4 values for each row and in total 4 rows
-    vld1.s16      {q14-q15}, [r6]       @
-
-    @since we need to get a 32 bit o/p for two 16 bit multiplications
-    @we need a VMULL instruction
-@-----------------------------first and second row
-
-    vmull.s16     q8, d0, d28           @scale the first row first 4 elem
-    vmull.s16     q9, d28, d1           @scale the second row last 4 elemts
-
-    vmull.s16     q10, d2, d29          @ scale second row first 4 elem
-    vmull.s16     q11, d29, d3          @scale the second row last 4 elem
-    vmull.s16     q12, d4, d30          @scale third row first  4 elem
-
-    vst1.s32      {q8, q9}, [r2], r5    @ write the first row complete
-
-    vmull.s16     q13, d30, d5          @scale the third row last 4 elem
-    vmull.s16     q8, d6, d31           @scale the fourth row first 4 elem
-
-
-    vst1.s32      {q10, q11}, [r2], r5  @store the second row complete
-
-@------------------------------- 3rd and 4th row
-
-    vmull.s16     q9, d31, d7           @scale the fourth row second column
-
-    vst1.s32      {q12, q13}, [r2], r5  @store the third row complete
-
-    vmull.s16     q10, d8, d28          @scale the 5th row fisrst 4 elms
-    vmull.s16     q11, d28, d9          @scale the 5th row second 4 elems
-
-    vmull.s16     q12, d10, d29         @scale the 6th row first4 elements
-
-
-    vst1.s32      {q8, q9}, [r2], r5    @store fifth row
-
-@--------------------------------5th and 6th row
-
-    vmull.s16     q13, d29, d11         @scale 6th row sendond 4 elems
-
-    vmull.s16     q8, d12, d30          @scale 7th rw first 4 elms
-
-    vst1.s32      {q10, q11}, [r2], r5  @store 6th row second 4 elements
-
-    vmull.s16     q9, d30, d13          @scale 7th rw second 4 elms
-    vmull.s16     q10, d14, d31         @scale 8th rw forst 4 elms
-
-
-    vst1.s32      {q12, q13}, [r2], r5  @store 6th row
-
-@----------------------------------7th and 8th row
-    vmull.s16     q11, d31, d15         @scale 8th row second 4 elms
-
-    vst1.s32      {q8, q9}, [r2], r5    @store 7th row
-    vst1.s32      {q10, q11}, [r2], r5  @store 8th row
-
-@----------------------------------done writing
-
-    pop           {r4-r12, pc}          @pop back all variables
-
-
-
-
-
-