summaryrefslogtreecommitdiffstats
path: root/common/arm/ih264_resi_trans_a9.s
diff options
context:
space:
mode:
Diffstat (limited to 'common/arm/ih264_resi_trans_a9.s')
-rwxr-xr-xcommon/arm/ih264_resi_trans_a9.s604
1 files changed, 0 insertions, 604 deletions
diff --git a/common/arm/ih264_resi_trans_a9.s b/common/arm/ih264_resi_trans_a9.s
deleted file mode 100755
index 08821f5..0000000
--- a/common/arm/ih264_resi_trans_a9.s
+++ /dev/null
@@ -1,604 +0,0 @@
-@/******************************************************************************
-@ *
-@ * Copyright (C) 2015 The Android Open Source Project
-@ *
-@ * Licensed under the Apache License, Version 2.0 (the "License");
-@ * you may not use this file except in compliance with the License.
-@ * You may obtain a copy of the License at:
-@ *
-@ * http://www.apache.org/licenses/LICENSE-2.0
-@ *
-@ * Unless required by applicable law or agreed to in writing, software
-@ * distributed under the License is distributed on an "AS IS" BASIS,
-@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-@ * See the License for the specific language governing permissions and
-@ * limitations under the License.
-@ *
-@ *****************************************************************************
-@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
-@*/
-@/**
-@*******************************************************************************
-@* @file
-@* ih264_resi_trans_a9.s
-@*
-@* @brief
-@* Contains function definitions for residual computation and forward transform
-@*
-@* @author
-@* Ittiam
-@*
-@* @par List of Functions:
-@* ih264_resi_trans_4x4_a9
-@* ih264_resi_trans_8x8_a9
-@* @remarks
-@* None
-@*
-@*******************************************************************************
-
-
-.text
-.p2align 2
-@*****************************************************************************
-@*
-@* Function Name : ih264_resi_trans_4x4_a9
-@* Description   : Computes the 4x4 residual (src - pred), runs the H.264 4x4
-@*                 forward core transform (cf4) on it and multiplies each
-@*                 coefficient by the matching entry of g_scal_coff_h264_4x4
-@*                 (approximate scaling). Results are stored as 32-bit values.
-@*
-@* Arguments     :
-@*   R0    : pointer to src buffer  (8-bit samples)
-@*   R1    : pointer to pred buffer (8-bit samples)
-@*   R2    : pointer to dst buffer  (32-bit results)
-@*   R3    : src_stride, in bytes
-@*   STACK : pred_stride (bytes), dst_stride (32-bit elements)
-@*
-@* Values Returned : NONE
-@*
-@* Register Usage  : r4-r6 as scratch; NEON q0-q3 and q8-q15 only. The
-@*                   callee-saved NEON registers d8-d15 (q4-q7) are not
-@*                   touched, so nothing has to be spilled for them.
-@* Stack Usage     : 40 bytes (r4-r12, lr)
-@*
-@* Known Limitations
-@* \Assumptions    : Every row is fetched with an 8-byte load although only
-@*                   the first 4 bytes are used, so 4 bytes past each src
-@*                   and pred row are read.
-@*                   The pc-relative address computation assumes that pc
-@*                   reads as (current instruction + 8), i.e. ARM state.
-@*                   g_scal_coff_h264_4x4 is read as 16 signed 16-bit values.
-@*
-@* Revision History :
-@* DD MM YYYY Author(s) Changes
-@* 30 12 2009 100633 First version
-@*
-@*****************************************************************************
-
-
-    .global ih264_resi_trans_4x4_a9
-    .extern g_scal_coff_h264_4x4
-@ pc-relative offset of the scaling table, resolved at 4x4lbl below
-g_scal_coff_h264_4x4_addr:
-    .long g_scal_coff_h264_4x4 - 4x4lbl - 8
-
-ih264_resi_trans_4x4_a9:
-
-    push        {r4-r12, lr}            @ 10 core registers = 40 bytes
-
-    add         r6, sp, #40             @ r6 -> first stack argument
-    ldmfd       r6, {r4-r5}             @ r4 = pred_stride, r5 = dst_stride
-
-    @--------------------argument loading done------------------------
-
-    @ Residual computation.
-    @ Each vld1 fills a whole D register (8 bytes); only lanes 0-3 are
-    @ consumed later, lanes 4-7 are don't-care.
-    vld1.u8     d30, [r0], r3           @ src  row 1
-    vld1.u8     d31, [r1], r4           @ pred row 1
-
-    vld1.u8     d28, [r0], r3           @ src  row 2
-    vld1.u8     d29, [r1], r4           @ pred row 2
-
-    vld1.u8     d26, [r0], r3           @ src  row 3
-    vsubl.u8    q0, d30, d31            @ q0 = src - pred, row 1 (16 bit)
-
-    vld1.u8     d27, [r1], r4           @ pred row 3
-    vsubl.u8    q1, d28, d29            @ q1 = src - pred, row 2
-
-    vld1.u8     d24, [r0], r3           @ src  row 4
-
-    vld1.u8     d25, [r1], r4           @ pred row 4
-    vsubl.u8    q2, d26, d27            @ q2 = src - pred, row 3
-
-    lsl         r5, r5, #2              @ dst stride: 32-bit elements -> bytes
-    ldr         r6, g_scal_coff_h264_4x4_addr
-4x4lbl:
-    add         r6, r6, pc              @ r6 = &g_scal_coff_h264_4x4
-
-    vsubl.u8    q3, d24, d25            @ q3 = src - pred, row 4
-
-    @ after this the useful residuals are in the low halves:
-    @ D0 -> row 1
-    @ D2 -> row 2
-    @ D4 -> row 3
-    @ D6 -> row 4
-
-    @ transpose the matrix so that the horizontal transform can be done
-    @ with whole-register adds and subtracts
-    @ #1 #2 #3 #4
-    @ a  b  c  d  ---- D0
-    @ e  f  g  h  ---- D2
-    @ i  j  k  l  ---- D4
-    @ m  n  o  p  ---- D6
-    @ transpose the inner 2x2 blocks
-    vtrn.16     d0, d2
-    vld1.s16    {q10}, [r6]!            @ scaling values 0-7, r6 += 16
-    vtrn.16     d4, d6
-    @ a e c g
-    @ b f d h
-    @ i m k o
-    @ j n l p
-    vtrn.32     d0, d4
-    vtrn.32     d2, d6
-    @ a e i m  #1 -- D0 --- x4
-    @ b f j n  #2 -- D2 --- x5
-    @ c g k o  #3 -- D4 --- x6
-    @ d h l p  #4 -- D6 --- x7
-
-    @ horizontal transform
-
-    vsub.s16    d5, d2, d4              @ x2 = x5 - x6
-    vsub.s16    d7, d0, d6              @ x3 = x4 - x7
-
-    vadd.s16    d3, d2, d4              @ x1 = x5 + x6
-    vadd.s16    d1, d0, d6              @ x0 = x4 + x7
-
-
-    vshl.s16    d31, d7, #1             @ x3 << 1
-    vshl.s16    d30, d5, #1             @ x2 << 1
-
-    vadd.s16    d0, d1, d3              @ x0 + x1
-    vsub.s16    d4, d1, d3              @ x0 - x1
-
-    vadd.s16    d2, d31, d5             @ (x3 << 1) + x2
-    vsub.s16    d6, d7, d30             @ x3 - (x2 << 1)
-
-    @ transpose again so that the vertical transform can reuse the same
-    @ register-wise arithmetic
-    vtrn.16     d0, d2
-    vtrn.16     d4, d6
-
-    vtrn.32     d0, d4
-    vtrn.32     d2, d6
-
-    @ vertical transform, first butterfly stage (same as horizontal)
-
-    vadd.s16    d1, d0, d6              @ x0 = x4 + x7
-    vadd.s16    d3, d2, d4              @ x1 = x5 + x6
-    vsub.s16    d7, d0, d6              @ x3 = x4 - x7
-    vsub.s16    d5, d2, d4              @ x2 = x5 - x6
-
-
-    @ Second butterfly stage interleaved with the scaling.
-    @ vmull.s16 widens the 16 x 16 bit products to 32 bit. The products go
-    @ to q8, q9, q11 and q12: these are caller-saved and no longer hold live
-    @ data here, whereas q4-q7 (d8-d15) would have to be preserved for the
-    @ caller.
-
-    vadd.s16    d24, d3, d1             @ x4 = x0 + x1
-    vsub.s16    d28, d1, d3             @ x6 = x0 - x1
-
-    vshl.s16    d0, d7, #1              @ x3 << 1
-    vmull.s16   q8, d24, d20            @ x4 * scale[0..3]
-
-    vshl.s16    d2, d5, #1              @ x2 << 1
-
-    vadd.s16    d26, d0, d5             @ x5 = (x3 << 1) + x2
-    vmull.s16   q9, d26, d21            @ x5 * scale[4..7]
-
-    vst1.s32    {q8}, [r2], r5          @ store output row 1, advance dst
-
-    vld1.s16    {q10}, [r6]             @ scaling values 8-15
-
-    vsub.s16    d30, d7, d2             @ x7 = x3 - (x2 << 1)
-
-    vmull.s16   q11, d28, d20           @ x6 * scale[8..11]
-    vst1.s32    {q9}, [r2], r5          @ store output row 2
-
-    vmull.s16   q12, d30, d21           @ x7 * scale[12..15]
-
-
-    vst1.s32    {q11}, [r2], r5         @ store output row 3
-    vst1.s32    {q12}, [r2]             @ store output row 4
-
-    pop         {r4-r12, pc}            @ restore registers and return
-
-
-
-
-@*****************************************************************************
-@* Function Name : ih264_resi_trans_8x8_a9
-@* Description   : Computes the 8x8 residual (src - pred), runs the H.264 8x8
-@*                 forward transform (cf8) on it and multiplies each
-@*                 coefficient by an entry of g_scal_coff_h264_8x8
-@*                 (approximate normalization). Results are stored as 32-bit
-@*                 values.
-@*
-@* Arguments     :
-@*   R0    : pointer to src buffer  (8-bit samples)
-@*   R1    : pointer to pred buffer (8-bit samples)
-@*   R2    : pointer to dst buffer  (32-bit results)
-@*   R3    : src_stride, in bytes
-@*   STACK : pred_stride (bytes), dst_stride (32-bit elements)
-@*
-@* Values Returned : NONE
-@*
-@* Register Usage  : r4-r6 as scratch; NEON q0-q15. d8-d15 (q4-q7) are
-@*                   callee-saved and are therefore spilled on entry and
-@*                   reloaded before returning.
-@* Stack Usage     : 104 bytes (r4-r12, lr = 40 bytes; d8-d15 = 64 bytes)
-@*
-@* Known Limitations
-@* \Assumptions    : The pc-relative address computation assumes that pc
-@*                   reads as (current instruction + 8), i.e. ARM state.
-@*                   Only the first 16 signed 16-bit entries of
-@*                   g_scal_coff_h264_8x8 are read: 4 values per row for
-@*                   4 rows, reused for both halves of a row and again for
-@*                   rows 5-8.
-@*
-@* Revision History :
-@* DD MM YYYY Author(s) Changes
-@* 30 12 2009 100633 First version
-@*
-@*****************************************************************************
-
-
-    .global ih264_resi_trans_8x8_a9
-    .extern g_scal_coff_h264_8x8
-@ pc-relative offset of the scaling table, resolved at 8x8lbl below
-g_scal_coff_h264_8x8_addr:
-    .long g_scal_coff_h264_8x8 - 8x8lbl - 8
-
-
-ih264_resi_trans_8x8_a9:
-
-    push        {r4-r12, lr}            @ 10 core registers = 40 bytes
-    vpush       {d8-d15}                @ callee-saved NEON registers, 64 bytes
-
-    add         r6, sp, #104            @ skip 40 + 64 saved bytes
-    ldmfd       r6, {r4-r5}             @ r4 = pred_stride, r5 = dst_stride
-
-    @--------------------argument loading done------------------------
-
-    @ Residual computation: one 8-pixel row per D register, widened to
-    @ 16 bit by vsubl
-    vld1.u8     d30, [r0], r3           @ src  row 1
-    vld1.u8     d31, [r1], r4           @ pred row 1
-
-    vld1.u8     d28, [r0], r3           @ src  row 2
-    vld1.u8     d29, [r1], r4           @ pred row 2
-    vsubl.u8    q0, d30, d31            @ q0 = src - pred, row 1
-
-    vld1.u8     d26, [r0], r3           @ src  row 3
-    vld1.u8     d27, [r1], r4           @ pred row 3
-    vsubl.u8    q1, d28, d29            @ q1 = src - pred, row 2
-
-    vld1.u8     d24, [r0], r3           @ src  row 4
-    vld1.u8     d25, [r1], r4           @ pred row 4
-    vsubl.u8    q2, d26, d27            @ q2 = src - pred, row 3
-
-    vld1.u8     d22, [r0], r3           @ src  row 5
-    vld1.u8     d23, [r1], r4           @ pred row 5
-    vsubl.u8    q3, d24, d25            @ q3 = src - pred, row 4
-
-    vld1.u8     d20, [r0], r3           @ src  row 6
-    vld1.u8     d21, [r1], r4           @ pred row 6
-    vsubl.u8    q4, d22, d23            @ q4 = src - pred, row 5
-
-    vld1.u8     d18, [r0], r3           @ src  row 7
-    vld1.u8     d19, [r1], r4           @ pred row 7
-    vsubl.u8    q5, d20, d21            @ q5 = src - pred, row 6
-
-    vld1.u8     d16, [r0], r3           @ src  row 8
-    vld1.u8     d17, [r1], r4           @ pred row 8
-    vsubl.u8    q6, d18, d19            @ q6 = src - pred, row 7
-
-    lsl         r5, r5, #2              @ dst stride: 32-bit elements -> bytes
-
-
-    vsubl.u8    q7, d16, d17            @ q7 = src - pred, row 8
-
-    @ after this Q0..Q7 hold residual rows 1..8
-
-    @ transpose the matrix so that the horizontal transform can be done
-    @ with whole-register adds and subtracts
-
-    @ transpose the inner 2x2 blocks
-    vtrn.16     q0, q1
-    vtrn.16     q2, q3
-    vtrn.16     q4, q5
-    vtrn.16     q6, q7
-
-    @ transpose the inner 4x4 blocks
-    vtrn.32     q0, q2
-    vtrn.32     q1, q3
-
-    vtrn.32     q4, q6
-    vtrn.32     q5, q7
-
-    @ swap the off-diagonal 4x4 quadrants to finish the 8x8 transpose
-    vswp        d1, d8
-    vswp        d7, d14
-    vswp        d3, d10
-    vswp        d5, d12
-    @ transpose done, data is in Q0-Q7 (called r0..r7 in the comments below)
-
-    @ horizontal transform
-
-    vadd.s16    q8, q0, q7              @ a0 = r0 + r7;
-    vadd.s16    q9, q1, q6              @ a1 = r1 + r6;
-    vadd.s16    q10, q2, q5             @ a2 = r2 + r5;
-    vadd.s16    q11, q3, q4             @ a3 = r3 + r4;
-
-    vsub.s16    q12, q0, q7             @ b0 = r0 - r7;
-    vsub.s16    q13, q1, q6             @ b1 = r1 - r6;
-    vsub.s16    q15, q3, q4             @ b3 = r3 - r4;
-    vsub.s16    q14, q2, q5             @ b2 = r2 - r5;
-
-    vadd.s16    q1, q8, q11             @ a4 = a0 + a3;
-    vadd.s16    q3, q9, q10             @ a5 = a1 + a2;
-    vsub.s16    q7, q9, q10             @ a7 = a1 - a2;
-    vsub.s16    q5, q8, q11             @ a6 = a0 - a3;
-
-    ldr         r6, g_scal_coff_h264_8x8_addr
-8x8lbl:
-    add         r6, r6, pc              @ r6 = &g_scal_coff_h264_8x8
-
-    vadd.s16    q0, q1, q3              @ pi2_res[0] = a4 + a5;
-    vshr.s16    q8, q7, #1              @ a7 >> 1
-
-    vsub.s16    q4, q1, q3              @ pi2_res[4] = a4 - a5;
-
-    vadd.s16    q2, q5, q8              @ pi2_res[2] = a6 + (a7 >> 1);
-
-
-    vshr.s16    q9, q5, #1              @ a6 >> 1
-    vsub.s16    q6, q9, q7              @ pi2_res[6] = (a6 >> 1) - a7;
-
-    @ Q0, Q2, Q4, Q6 now hold the even results and must not be changed
-    @ Q1, Q3, Q5, Q7 will receive the odd results
-    @ Q8-Q11 are free scratch
-
-    vshr.s16    q1, q12, #1             @ b0 >> 1
-    vshr.s16    q3, q13, #1             @ b1 >> 1
-    vshr.s16    q5, q14, #1             @ b2 >> 1
-    vshr.s16    q7, q15, #1             @ b3 >> 1
-
-    vadd.s16    q8, q1, q12             @ (b0 >> 1) + b0
-    vadd.s16    q9, q3, q13             @ (b1 >> 1) + b1
-    vadd.s16    q10, q5, q14            @ (b2 >> 1) + b2
-    vadd.s16    q11, q7, q15            @ (b3 >> 1) + b3
-
-    vadd.s16    q1, q14, q8             @ b2 + ((b0 >> 1) + b0)
-    vsub.s16    q5, q15, q9             @ b3 - ((b1 >> 1) + b1)
-    vadd.s16    q3, q15, q10            @ b3 + ((b2 >> 1) + b2)
-    vsub.s16    q7, q11, q14            @ -b2 + ((b3 >> 1) + b3)
-
-    vadd.s16    q8, q13, q1             @ b4 = b1 + b2 + ((b0 >> 1) + b0);
-    vsub.s16    q9, q12, q3             @ b5 = b0 - b3 - ((b2 >> 1) + b2);
-    vadd.s16    q10, q12, q5            @ b6 = b0 + b3 - ((b1 >> 1) + b1);
-    vadd.s16    q11, q13, q7            @ b7 = b1 - b2 + ((b3 >> 1) + b3);
-
-    vshr.s16    q15, q8, #2             @ b4 >> 2
-    vshr.s16    q14, q9, #2             @ b5 >> 2
-    vshr.s16    q13, q10, #2            @ b6 >> 2
-    vshr.s16    q12, q11, #2            @ b7 >> 2
-
-
-    vadd.s16    q3, q9, q13             @ pi2_res[3] = b5 + (b6 >> 2);
-    vsub.s16    q5, q10, q14            @ pi2_res[5] = b6 - (b5 >> 2);
-    vadd.s16    q1, q8, q12             @ pi2_res[1] = b4 + (b7 >> 2);
-    vsub.s16    q7, q15, q11            @ pi2_res[7] = (b4 >> 2) - b7;
-
-    @------------horizontal transform done-------------------------
-    @ results are in Q0-Q7, Q8-Q15 are free again
-
-    @ vertical transform: transpose back, then the same arithmetic again
-
-    @ transpose the inner 2x2 blocks
-    vtrn.16     q0, q1
-    vtrn.16     q2, q3
-    vtrn.16     q4, q5
-    vtrn.16     q6, q7
-
-    @ transpose the inner 4x4 blocks
-    vtrn.32     q0, q2
-    vtrn.32     q1, q3
-
-    vtrn.32     q4, q6
-    vtrn.32     q5, q7
-
-    @ swap the off-diagonal 4x4 quadrants to finish the 8x8 transpose
-    vswp        d1, d8
-    vswp        d3, d10
-    vswp        d5, d12
-    vswp        d7, d14
-
-    @ transpose done
-
-    vadd.s16    q8, q0, q7              @ a0 = r0 + r7;
-    vadd.s16    q9, q1, q6              @ a1 = r1 + r6;
-    vadd.s16    q10, q2, q5             @ a2 = r2 + r5;
-    vadd.s16    q11, q3, q4             @ a3 = r3 + r4;
-
-    vsub.s16    q12, q0, q7             @ b0 = r0 - r7;
-    vsub.s16    q13, q1, q6             @ b1 = r1 - r6;
-    vsub.s16    q14, q2, q5             @ b2 = r2 - r5;
-    vsub.s16    q15, q3, q4             @ b3 = r3 - r4;
-
-    vadd.s16    q1, q8, q11             @ a4 = a0 + a3;
-    vadd.s16    q3, q9, q10             @ a5 = a1 + a2;
-    vsub.s16    q5, q8, q11             @ a6 = a0 - a3;
-    vsub.s16    q7, q9, q10             @ a7 = a1 - a2;
-
-
-    vadd.s16    q0, q1, q3              @ pi2_res[0] = a4 + a5;
-
-    vshr.s16    q8, q7, #1              @ a7 >> 1
-    vadd.s16    q2, q5, q8              @ pi2_res[2] = a6 + (a7 >> 1);
-
-    vsub.s16    q4, q1, q3              @ pi2_res[4] = a4 - a5;
-
-    vshr.s16    q9, q5, #1              @ a6 >> 1
-    vsub.s16    q6, q9, q7              @ pi2_res[6] = (a6 >> 1) - a7;
-
-    @ Q0, Q2, Q4, Q6 now hold the even results and must not be changed
-    @ Q1, Q3, Q5, Q7 will receive the odd results
-    @ Q8-Q11 are free scratch
-
-    vshr.s16    q1, q12, #1             @ b0 >> 1
-    vshr.s16    q3, q13, #1             @ b1 >> 1
-    vshr.s16    q5, q14, #1             @ b2 >> 1
-    vshr.s16    q7, q15, #1             @ b3 >> 1
-
-
-    vadd.s16    q8, q1, q12             @ (b0 >> 1) + b0
-    vadd.s16    q9, q3, q13             @ (b1 >> 1) + b1
-    vadd.s16    q10, q5, q14            @ (b2 >> 1) + b2
-    vadd.s16    q11, q7, q15            @ (b3 >> 1) + b3
-
-    vadd.s16    q1, q14, q8             @ b2 + ((b0 >> 1) + b0)
-    vadd.s16    q3, q15, q10            @ b3 + ((b2 >> 1) + b2)
-    vsub.s16    q5, q15, q9             @ b3 - ((b1 >> 1) + b1)
-    vsub.s16    q7, q11, q14            @ -b2 + ((b3 >> 1) + b3)
-
-    vadd.s16    q8, q13, q1             @ b4 = b1 + b2 + ((b0 >> 1) + b0);
-    vsub.s16    q9, q12, q3             @ b5 = b0 - b3 - ((b2 >> 1) + b2);
-    vadd.s16    q10, q12, q5            @ b6 = b0 + b3 - ((b1 >> 1) + b1);
-    vadd.s16    q11, q13, q7            @ b7 = b1 - b2 + ((b3 >> 1) + b3);
-
-    vshr.s16    q15, q8, #2             @ b4 >> 2
-    vshr.s16    q14, q9, #2             @ b5 >> 2
-    vshr.s16    q13, q10, #2            @ b6 >> 2
-    vshr.s16    q12, q11, #2            @ b7 >> 2
-
-
-    @ the transform results stay 16 bit; widening happens in vmull below
-    vsub.s16    q5, q10, q14            @ pi2_res[5] = b6 - (b5 >> 2);
-    vsub.s16    q7, q15, q11            @ pi2_res[7] = (b4 >> 2) - b7;
-    vadd.s16    q3, q9, q13             @ pi2_res[3] = b5 + (b6 >> 2);
-    vadd.s16    q1, q8, q12             @ pi2_res[1] = b4 + (b7 >> 2);
-
-    @------------vertical transform done-------------------------
-    @ results are in Q0-Q7 (row n in Q(n-1)), Q8-Q15 are free again
-
-    @ scaling
-    @ 16 scale values are loaded: d28, d29, d30, d31 hold 4 values each.
-    @ d28 scales both halves of rows 1 and 5, d29 rows 2 and 6,
-    @ d30 rows 3 and 7, d31 rows 4 and 8.
-    vld1.s16    {q14-q15}, [r6]
-
-    @ vmull.s16 widens the 16 x 16 bit products to 32 bit.
-    @ Multiplies and stores are interleaved; Q8-Q13 rotate as output buffers.
-
-    vmull.s16   q8, d0, d28             @ row 1, first 4 elements
-    vmull.s16   q9, d28, d1             @ row 1, last 4 elements
-
-    vmull.s16   q10, d2, d29            @ row 2, first 4 elements
-    vmull.s16   q11, d29, d3            @ row 2, last 4 elements
-    vmull.s16   q12, d4, d30            @ row 3, first 4 elements
-
-    vst1.s32    {q8, q9}, [r2], r5      @ store row 1
-
-    vmull.s16   q13, d30, d5            @ row 3, last 4 elements
-    vmull.s16   q8, d6, d31             @ row 4, first 4 elements
-
-
-    vst1.s32    {q10, q11}, [r2], r5    @ store row 2
-
-    vmull.s16   q9, d31, d7             @ row 4, last 4 elements
-
-    vst1.s32    {q12, q13}, [r2], r5    @ store row 3
-
-    vmull.s16   q10, d8, d28            @ row 5, first 4 elements
-    vmull.s16   q11, d28, d9            @ row 5, last 4 elements
-
-    vmull.s16   q12, d10, d29           @ row 6, first 4 elements
-
-
-    vst1.s32    {q8, q9}, [r2], r5      @ store row 4
-
-    vmull.s16   q13, d29, d11           @ row 6, last 4 elements
-
-    vmull.s16   q8, d12, d30            @ row 7, first 4 elements
-
-    vst1.s32    {q10, q11}, [r2], r5    @ store row 5
-
-    vmull.s16   q9, d30, d13            @ row 7, last 4 elements
-    vmull.s16   q10, d14, d31           @ row 8, first 4 elements
-
-
-    vst1.s32    {q12, q13}, [r2], r5    @ store row 6
-
-    vmull.s16   q11, d31, d15           @ row 8, last 4 elements
-
-    vst1.s32    {q8, q9}, [r2], r5      @ store row 7
-    vst1.s32    {q10, q11}, [r2], r5    @ store row 8
-
-    @----------------------------------done writing
-
-    vpop        {d8-d15}                @ restore callee-saved NEON registers
-    pop         {r4-r12, pc}            @ restore core registers and return
-
-
-
-
-
-