Diffstat (limited to 'decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s')
-rw-r--r--   decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s   523
1 files changed, 523 insertions, 0 deletions
diff --git a/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
new file mode 100644
index 0000000..485ee66
--- /dev/null
+++ b/decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s
@@ -0,0 +1,523 @@
+///*****************************************************************************
+//*
+//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
+//*
+//* Licensed under the Apache License, Version 2.0 (the "License");
+//* you may not use this file except in compliance with the License.
+//* You may obtain a copy of the License at:
+//*
+//* http://www.apache.org/licenses/LICENSE-2.0
+//*
+//* Unless required by applicable law or agreed to in writing, software
+//* distributed under the License is distributed on an "AS IS" BASIS,
+//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+//* See the License for the specific language governing permissions and
+//* limitations under the License.
+//*
+//*****************************************************************************/
+///**
+///*******************************************************************************
+//* //file
+//* ihevcd_fmt_conv_420sp_to_rgba8888.s
+//*
+//* //brief
+//* contains function definitions for format conversions
+//*
+//* //author
+//* ittiam
+//*
+//* //par list of functions:
+//*
+//*
+//* //remarks
+//* none
+//*
+//*******************************************************************************/
+
+ .equ DO1STROUNDING, 0
+
+ // ARM
+ //
+ // PRESERVE8
+
+.text
+.p2align 2
+
+.include "ihevc_neon_macros.s"
+
+
+
+///*****************************************************************************
+//* *
+//* Function Name : ihevcd_fmt_conv_420sp_to_rgba8888() *
+//* *
+//*  Description           : This function converts the image from YUV420SP   *
+//*                          color space to RGBA8888 color space. The         *
+//*                          function can be invoked at the MB level.         *
+//* *
+//*  Arguments             : x0          pubY                                 *
+//*                          x1          pubUV                                *
+//*                          x2          pusRGB                               *
+//*                          x3          usWidth                              *
+//*                          x4          usHeight                             *
+//*                          x5          usStrideY                            *
+//*                          x6          usStrideUV                           *
+//*                          x7          usStrideRGB                          *
+//* *
+//* Values Returned : None *
+//* *
+//* Register Usage : x0 - x14 *
+//* *
+//* Stack Usage : 40 Bytes *
+//* *
+//* Interruptibility : Interruptible *
+//* *
+//* Known Limitations *
+//* Assumptions: Image Width: Assumed to be multiple of 16 and *
+//* greater than or equal to 16 *
+//* Image Height: Assumed to be even. *
+//* *
+//* Revision History : *
+//* DD MM YYYY Author(s) Changes (Describe the changes made) *
+//* 07 06 2010 Varshita Draft *
+//* 07 06 2010 Naveen Kr T Completed *
+//* 05 08 2013 Naveen K P Modified for HEVC *
+//*****************************************************************************/
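+///*****************************************************************************
+//* Reference (not part of the original source): a scalar C sketch of the
+//* conversion this routine vectorizes. Names are illustrative; the
+//* coefficients are read off the constants C1-C4 below (Q2.13 fixed point),
+//* and the B,G,R,A byte order falls out of the ZIP sequence, assuming a
+//* little-endian store of each 32-bit pixel.
+//*
+//*   #include <stdint.h>
+//*   static inline uint32_t clip255(int x)
+//*   { return x < 0 ? 0 : (x > 255 ? 255 : (uint32_t)x); }
+//*
+//*   void fmt_conv_420sp_to_rgba8888_ref(const uint8_t *y, const uint8_t *uv,
+//*                                       uint32_t *rgba, int wd, int ht,
+//*                                       int y_strd, int uv_strd, int rgba_strd)
+//*   {
+//*       for(int row = 0; row < ht; row += 2)
+//*           for(int col = 0; col < wd; col += 2)
+//*           {
+//*               int u  = uv[(row / 2) * uv_strd + col + 0] - 128;
+//*               int v  = uv[(row / 2) * uv_strd + col + 1] - 128;
+//*               int rw = (v * 13073) >> 13;              /* C1 = 0x3311 ~  1.596 */
+//*               int gw = (u * -3207 + v * -6664) >> 13;  /* C2,C3 ~ -0.391,-0.813 */
+//*               int bw = (u * 16530) >> 13;              /* C4 = 0x4092 ~  2.018 */
+//*               for(int dy = 0; dy < 2; dy++)
+//*                   for(int dx = 0; dx < 2; dx++)
+//*                   {
+//*                       int luma   = y[(row + dy) * y_strd + col + dx];
+//*                       uint32_t b = clip255(luma + bw);  /* byte 0 */
+//*                       uint32_t g = clip255(luma + gw);  /* byte 1 */
+//*                       uint32_t r = clip255(luma + rw);  /* byte 2 */
+//*                       rgba[(row + dy) * rgba_strd + col + dx] =
+//*                           b | (g << 8) | (r << 16);     /* alpha byte = 0 */
+//*                   }
+//*           }
+//*   }
+//*****************************************************************************/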
+ .global ihevcd_fmt_conv_420sp_to_rgba8888_av8
+.type ihevcd_fmt_conv_420sp_to_rgba8888_av8, function
+ihevcd_fmt_conv_420sp_to_rgba8888_av8:
+
+ //// push the registers on the stack
+ // STMFD sp!,{x4-x12,x14}
+ push_v_regs
+ stp x19, x20,[sp,#-16]!
+
+
+    ////x0 - Y PTR
+    ////x1 - UV PTR
+    ////x2 - RGB PTR
+    ////x3 - PIC WIDTH
+    ////x4 - PIC HT
+    ////x5 - STRIDE Y
+    ////x6 - STRIDE UV
+    ////x7 - STRIDE RGB
+    ////(RESHUFFLED BELOW SO THAT x5=HT, x6=STRIDE Y, x7=STRIDE UV, x9=STRIDE RGB)
+
+ ////ONE ROW PROCESSING AT A TIME
+
+ ////THE FOUR CONSTANTS ARE:
+ ////C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
+
+ //PLD [x0]
+ //PLD [x1]
+ //PLD [x2]
+
+
+ ///* can be loaded from a defined const type */
+ mov x10,#0x3311
+ mov v0.4h[0], w10 ////C1
+
+ mov x10,#0xF379
+ mov v0.4h[1], w10 ////C2
+
+ mov x10,#0xE5F8
+ mov v0.4h[2], w10 ////C3
+
+ mov x10,#0x4092
+ mov v0.4h[3], w10 ////C4
+
+    ////LOAD CONSTANT 128 AND DUPLICATE IT ACROSS A VECTOR REGISTER
+ MOV x10,#128
+ dup v1.8b,w10
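+    ////THE CONSTANTS ARE Q2.13 FIXED-POINT COLOUR COEFFICIENTS (SIGNED 16-BIT,
+    ////SCALED BY 8192); READ AS APPROXIMATELY THE BT.601 SET:
+    ////    C1 = 0x3311 =  13073  ->  1.596  (V WEIGHT FOR R)
+    ////    C2 = 0xF379 =  -3207  -> -0.391  (U WEIGHT FOR G)
+    ////    C3 = 0xE5F8 =  -6664  -> -0.813  (V WEIGHT FOR G)
+    ////    C4 = 0x4092 =  16530  ->  2.018  (U WEIGHT FOR B)
+    ////E.G. R = CLIP(Y + (((V-128)*C1) >> 13))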
+
+ ////D0 HAS C1-C2-C3-C4
+    //// rearrange the stride/height arguments into the registers used below
+    //// (they arrive in x3-x7 on AArch64; no stack loads are needed)
+    mov x9, x7                  //// x9 = stride rgb
+    mov x7, x6                  //// x7 = stride uv
+    mov x6, x5                  //// x6 = stride y
+    mov x5, x4                  //// x5 = height
+ //LDR x4,[sp,#44]
+ //LDR x8,[sp,#52]
+
+ //// calculate offsets, offset = stride - width
+    SUB     x10,x6,x3           //// luma offset
+    SUB     x11,x7,x3           //// uv offset (UV is interleaved, so no LSR #1)
+    //SUB     x12,x8,x3, LSR #1        @// v offset
+    SUB     x14,x9,x3           //// rgb offset in pixels
+
+ //// calculate height loop count
+    LSR     x5, x5, #1          //// height_cnt = height / 2 (two rows per iteration)
+
+ //// create next row pointers for rgb and luma data
+ ADD x7,x0,x6 //// luma_next_row = luma + luma_stride
+ ADD x8,x2,x9,LSL #2 //// rgb_next_row = rgb + rgb_stride
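+    //// Loop structure (a sketch, inferred from the code below): each height
+    //// iteration consumes two luma rows and one interleaved chroma row,
+    //// 16 pixels at a time. The last 16-pixel block of each row pair is
+    //// peeled into WIDTH_LOOP_SKIP so the main loop can always prefetch the
+    //// next UV and Y blocks before converting the current one:
+    ////
+    ////    for(h = ht/2; h != 0; h--) {
+    ////        load 16 UV and 2x16 Y;
+    ////        for(w = wd/16 - 1; w != 0; w--)
+    ////            convert + store 2x16 pixels; load next 16 UV and 2x16 Y;
+    ////        convert + store the final 2x16 pixels;
+    ////        advance the row pointers by their offsets;
+    ////    }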
+
+LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:
+
+ ////LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
+    LD1 {v2.8b, v3.8b},[x1],#16 ////LOAD 16 INTERLEAVED UV VALUES (8 U, 8 V)
+ ////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V
+
+ //// calculate width loop count
+ LSR x6, x3, #4 //// width_cnt = width / 16
+
+    ////COMPUTE THE ACTUAL RGB VALUES; WE CAN DO TWO ROWS AT A TIME
+    ////LOAD THE 8-BIT Y VALUES
+    LD2 {v30.8b, v31.8b},[x0],#16 ////v30 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
+    ////v31 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+    LD2 {v28.8b, v29.8b},[x7],#16 ////v28 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 2
+    ////v29 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
+
+ SUBS x6,x6,#1
+ BEQ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
+ //VMOV.I8 Q1,#128
+ UZP1 v27.8b, v2.8b, v3.8b
+ UZP2 v3.8b, v2.8b, v3.8b
+ mov v2.d[0], v27.d[0]
+
+ ////NEED TO SUBTRACT (U-128) AND (V-128)
+ ////(D2-D1),(D3-D1)
+ uSUBL v4.8h, v2.8b, v1.8b ////(U-128)
+ uSUBL v6.8h, v3.8b, v1.8b ////(V-128)
+
+    ////LOAD THE NEXT 16 INTERLEAVED UV VALUES (FOR THE NEXT 16-PIXEL BLOCK)
+    LD1 {v2.8b, v3.8b},[x1],#16 ////8 U AND 8 V VALUES
+ ////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V
+
+ //PLD [x0]
+ prfm PLDL1KEEP,[x1]
+
+    ////MULTIPLY (U-128) AND (V-128) WITH THE COEFFICIENTS
+ sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+
+ sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
+ sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
+
+ sMULL v12.4s, v4.4h, v0.4h[1] ////(U-128)*C2 FOR G
+ sMLAL v12.4s, v6.4h, v0.4h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
+ sMULL2 v14.4s, v4.8h, v0.4h[1] ////(U-128)*C2 FOR G
+ sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
+
+ ////NARROW RIGHT SHIFT BY 13 FOR R&B
+ sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ ////Q4 - WEIGHT FOR B
+
+ ////NARROW RIGHT SHIFT BY 13 FOR R&B
+ sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ ////Q5 - WEIGHT FOR R
+
+ ////NARROW RIGHT SHIFT BY 13 FOR G
+ sqshrn v12.4h, v12.4s,#13 ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ ////Q6 - WEIGHT FOR G
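+    ////v8/v10/v12 NOW HOLD SIGNED 16-BIT B/R/G OFFSETS; ADDING THEM TO THE
+    ////UNSIGNED LUMA (UADDW) AND SATURATING TO 8 BITS (sqxtun) GIVES
+    ////CLIP(Y + WEIGHT) PER CHANNEL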
+
+ UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
+
+ UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
+
+    sqxtun v14.8b, v14.8h       ////B (row 1, even pixels)
+    sqxtun v15.8b, v18.8h       ////G
+    sqxtun v16.8b, v16.8h       ////R
+    movi v17.8b, #0             ////ALPHA = 0
+
+    sqxtun v20.8b, v20.8h       ////B (row 1, odd pixels)
+    sqxtun v21.8b, v24.8h       ////G
+    sqxtun v22.8b, v22.8h       ////R
+    movi v23.8b, #0             ////ALPHA = 0
+
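+    ////INTERLEAVE THE FOUR CHANNEL VECTORS INTO 32-BIT RGBA PIXELS:
+    ////BYTE-LEVEL ZIPs PAIR B WITH G AND R WITH A, HALFWORD-LEVEL ZIPs FORM
+    ////B,G,R,A QUADS, AND WORD-LEVEL ZIPs RESTORE THE EVEN/ODD PIXEL ORDER
+    ////THAT THE LD2 LUMA LOAD SPLIT APART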
+ ZIP1 v27.8b, v14.8b, v15.8b
+ ZIP2 v15.8b, v14.8b, v15.8b
+ mov v14.d[0], v27.d[0]
+ ZIP1 v27.8b, v16.8b, v17.8b
+ ZIP2 v17.8b, v16.8b, v17.8b
+ mov v16.d[0], v27.d[0]
+
+ ZIP1 v27.8b, v20.8b, v21.8b
+ ZIP2 v21.8b, v20.8b, v21.8b
+ mov v20.d[0], v27.d[0]
+ ZIP1 v27.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b
+ mov v22.d[0], v27.d[0]
+
+ mov v14.d[1], v15.d[0]
+ mov v20.d[1], v21.d[0]
+ mov v16.d[1], v17.d[0]
+ mov v22.d[1], v23.d[0]
+
+ ZIP1 v27.8h, v14.8h, v16.8h
+ ZIP2 v26.8h, v14.8h, v16.8h
+
+ ZIP1 v25.8h, v20.8h, v22.8h
+ ZIP2 v19.8h, v20.8h, v22.8h
+
+ ZIP1 v14.4s, v27.4s, v25.4s
+ ZIP2 v20.4s, v27.4s, v25.4s
+
+ ZIP1 v16.4s, v26.4s, v19.4s
+ ZIP2 v22.4s, v26.4s, v19.4s
+
+    ST1 {v14.4s},[x2],#16       ////STORE ROW-1 PIXELS 0-3
+    ST1 {v20.4s},[x2],#16       ////PIXELS 4-7
+    ST1 {v16.4s},[x2],#16       ////PIXELS 8-11
+    ST1 {v22.4s},[x2],#16       ////PIXELS 12-15
+
+    ////ROW 1 DONE (16 RGBA PIXELS STORED); NOW REPEAT FOR ROW 2:
+    ////ADD THE SAME CHROMA WEIGHTS TO THE ROW-2 LUMA (v28,v29)
+ UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+ UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
+
+ UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
+
+    ////COMPUTE THE ACTUAL RGB VALUES; WE CAN DO TWO ROWS AT A TIME
+    ////LOAD THE NEXT BLOCK'S 8-BIT Y VALUES
+    LD2 {v30.8b, v31.8b},[x0],#16 ////v30 - Y0,Y2,... OF THE NEXT BLOCK, row 1
+    ////v31 - Y1,Y3,... OF THE NEXT BLOCK
+    LD2 {v28.8b, v29.8b},[x7],#16 ////v28 - Y0,Y2,... OF THE NEXT BLOCK, row 2
+    ////v29 - Y1,Y3,... OF THE NEXT BLOCK
+
+ prfm PLDL1KEEP,[x0]
+ prfm PLDL1KEEP,[x7]
+
+ sqxtun v14.8b, v14.8h
+ sqxtun v15.8b, v18.8h
+ sqxtun v16.8b, v16.8h
+ movi v17.8b, #0
+
+ sqxtun v20.8b, v20.8h
+ sqxtun v21.8b, v24.8h
+ sqxtun v22.8b, v22.8h
+ movi v23.8b, #0
+
+ ZIP1 v27.8b, v14.8b, v15.8b
+ ZIP2 v15.8b, v14.8b, v15.8b
+ mov v14.d[0], v27.d[0]
+ ZIP1 v27.8b, v16.8b, v17.8b
+ ZIP2 v17.8b, v16.8b, v17.8b
+ mov v16.d[0], v27.d[0]
+
+ ZIP1 v27.8b, v20.8b, v21.8b
+ ZIP2 v21.8b, v20.8b, v21.8b
+ mov v20.d[0], v27.d[0]
+ ZIP1 v27.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b
+ mov v22.d[0], v27.d[0]
+
+ mov v14.d[1], v15.d[0]
+ mov v20.d[1], v21.d[0]
+ mov v16.d[1], v17.d[0]
+ mov v22.d[1], v23.d[0]
+
+ ZIP1 v27.8h, v14.8h, v16.8h
+ ZIP2 v26.8h, v14.8h, v16.8h
+
+ ZIP1 v25.8h, v20.8h, v22.8h
+ ZIP2 v19.8h, v20.8h, v22.8h
+
+ ZIP1 v14.4s, v27.4s, v25.4s
+ ZIP2 v20.4s, v27.4s, v25.4s
+
+ ZIP1 v16.4s, v26.4s, v19.4s
+ ZIP2 v22.4s, v26.4s, v19.4s
+
+ ST1 {v14.4s},[x8],#16
+ ST1 {v20.4s},[x8],#16
+ ST1 {v16.4s},[x8],#16
+ ST1 {v22.4s},[x8],#16
+
+ SUBS x6,x6,#1 //// width_cnt -= 1
+ BNE LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP
+
+LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
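+    //// Epilogue: convert the final 16-pixel block of this row pair. Its UV
+    //// values were already loaded (at the top of the height loop, or by the
+    //// last main-loop iteration), so no further loads are issued here.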
+ //VMOV.I8 Q1,#128
+ UZP1 v27.8b, v2.8b, v3.8b
+ UZP2 v3.8b, v2.8b, v3.8b
+ mov v2.d[0], v27.d[0]
+
+
+ ////NEED TO SUBTRACT (U-128) AND (V-128)
+ ////(D2-D1),(D3-D1)
+ uSUBL v4.8h, v2.8b, v1.8b ////(U-128)
+ uSUBL v6.8h, v3.8b, v1.8b ////(V-128)
+
+
+    ////MULTIPLY (U-128) AND (V-128) WITH THE COEFFICIENTS
+ sMULL v8.4s, v4.4h, v0.4h[3] ////(U-128)*C4 FOR B
+ sMULL2 v10.4s, v4.8h, v0.4h[3] ////(U-128)*C4 FOR B
+
+ sMULL v20.4s, v6.4h, v0.4h[0] ////(V-128)*C1 FOR R
+ sMULL2 v22.4s, v6.8h, v0.4h[0] ////(V-128)*C1 FOR R
+
+ sMULL v12.4s, v4.4h, v0.4h[1] ////(U-128)*C2 FOR G
+ sMLAL v12.4s, v6.4h, v0.4h[2] ////Q6 = (U-128)*C2 + (V-128)*C3
+ sMULL2 v14.4s, v4.8h, v0.4h[1] ////(U-128)*C2 FOR G
+ sMLAL2 v14.4s, v6.8h, v0.4h[2] ////Q7 = (U-128)*C2 + (V-128)*C3
+
+ ////NARROW RIGHT SHIFT BY 13 FOR R&B
+ sqshrn v8.4h, v8.4s,#13 ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
+ sqshrn2 v8.8h, v10.4s,#13 ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
+ ////Q4 - WEIGHT FOR B
+
+ ////NARROW RIGHT SHIFT BY 13 FOR R&B
+ sqshrn v10.4h, v20.4s,#13 ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
+ sqshrn2 v10.8h, v22.4s,#13 ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
+ ////Q5 - WEIGHT FOR R
+
+ ////NARROW RIGHT SHIFT BY 13 FOR G
+ sqshrn v12.4h, v12.4s,#13 ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ sqshrn2 v12.8h, v14.4s,#13 ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
+ ////Q6 - WEIGHT FOR G
+
+ UADDW v14.8h, v8.8h , v30.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v10.8h , v30.8b ////Q8 - HAS Y + R
+ UADDW v18.8h, v12.8h , v30.8b ////Q9 - HAS Y + G
+
+ UADDW v20.8h, v8.8h , v31.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v10.8h , v31.8b ////Q11 - HAS Y + R
+ UADDW v24.8h, v12.8h , v31.8b ////Q12 - HAS Y + G
+
+ sqxtun v14.8b, v14.8h
+ sqxtun v15.8b, v18.8h
+ sqxtun v16.8b, v16.8h
+ movi v17.8b, #0
+
+ sqxtun v20.8b, v20.8h
+ sqxtun v21.8b, v24.8h
+ sqxtun v22.8b, v22.8h
+ movi v23.8b, #0
+
+ ZIP1 v27.8b, v14.8b, v15.8b
+ ZIP2 v15.8b, v14.8b, v15.8b
+ mov v14.d[0], v27.d[0]
+ ZIP1 v27.8b, v16.8b, v17.8b
+ ZIP2 v17.8b, v16.8b, v17.8b
+ mov v16.d[0], v27.d[0]
+
+ ZIP1 v27.8b, v20.8b, v21.8b
+ ZIP2 v21.8b, v20.8b, v21.8b
+ mov v20.d[0], v27.d[0]
+ ZIP1 v27.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b
+ mov v22.d[0], v27.d[0]
+
+ mov v14.d[1], v15.d[0]
+ mov v20.d[1], v21.d[0]
+ mov v16.d[1], v17.d[0]
+ mov v22.d[1], v23.d[0]
+
+ ZIP1 v27.8h, v14.8h, v16.8h
+ ZIP2 v26.8h, v14.8h, v16.8h
+
+ ZIP1 v25.8h, v20.8h, v22.8h
+ ZIP2 v19.8h, v20.8h, v22.8h
+
+ ZIP1 v14.4s, v27.4s, v25.4s
+ ZIP2 v20.4s, v27.4s, v25.4s
+
+ ZIP1 v16.4s, v26.4s, v19.4s
+ ZIP2 v22.4s, v26.4s, v19.4s
+
+ ST1 {v14.4s},[x2],#16
+ ST1 {v20.4s},[x2],#16
+ ST1 {v16.4s},[x2],#16
+ ST1 {v22.4s},[x2],#16
+
+    ////ROW 1 DONE (16 RGBA PIXELS STORED); NOW REPEAT FOR ROW 2:
+    ////ADD THE SAME CHROMA WEIGHTS TO THE ROW-2 LUMA (v28,v29)
+ UADDW v14.8h, v8.8h , v28.8b ////Q7 - HAS Y + B
+ UADDW v16.8h, v10.8h , v28.8b ////Q2 - HAS Y + R
+ UADDW v18.8h, v12.8h , v28.8b ////Q3 - HAS Y + G
+
+ UADDW v20.8h, v8.8h , v29.8b ////Q10 - HAS Y + B
+ UADDW v22.8h, v10.8h , v29.8b ////Q11 - HAS Y + R
+ UADDW v24.8h, v12.8h , v29.8b ////Q12 - HAS Y + G
+
+ sqxtun v14.8b, v14.8h
+ sqxtun v15.8b, v18.8h
+ sqxtun v16.8b, v16.8h
+ movi v17.8b, #0
+
+ sqxtun v20.8b, v20.8h
+ sqxtun v21.8b, v24.8h
+ sqxtun v22.8b, v22.8h
+ movi v23.8b, #0
+
+ ZIP1 v27.8b, v14.8b, v15.8b
+ ZIP2 v15.8b, v14.8b, v15.8b
+ mov v14.d[0], v27.d[0]
+ ZIP1 v27.8b, v16.8b, v17.8b
+ ZIP2 v17.8b, v16.8b, v17.8b
+ mov v16.d[0], v27.d[0]
+
+ ZIP1 v27.8b, v20.8b, v21.8b
+ ZIP2 v21.8b, v20.8b, v21.8b
+ mov v20.d[0], v27.d[0]
+ ZIP1 v27.8b, v22.8b, v23.8b
+ ZIP2 v23.8b, v22.8b, v23.8b
+ mov v22.d[0], v27.d[0]
+
+ mov v14.d[1], v15.d[0]
+ mov v20.d[1], v21.d[0]
+ mov v16.d[1], v17.d[0]
+ mov v22.d[1], v23.d[0]
+
+ ZIP1 v27.8h, v14.8h, v16.8h
+ ZIP2 v26.8h, v14.8h, v16.8h
+
+ ZIP1 v25.8h, v20.8h, v22.8h
+ ZIP2 v19.8h, v20.8h, v22.8h
+
+ ZIP1 v14.4s, v27.4s, v25.4s
+ ZIP2 v20.4s, v27.4s, v25.4s
+
+ ZIP1 v16.4s, v26.4s, v19.4s
+ ZIP2 v22.4s, v26.4s, v19.4s
+
+ ST1 {v14.4s},[x8],#16
+ ST1 {v20.4s},[x8],#16
+ ST1 {v16.4s},[x8],#16
+ ST1 {v22.4s},[x8],#16
+
+ //// Adjust the address pointers
+ ADD x0,x7,x10 //// luma = luma_next + offset
+ ADD x2,x8,x14,LSL #2 //// rgb = rgb_next + offset
+
+ ADD x7,x0,x3 //// luma_next = luma + width
+ ADD x8,x2,x3,LSL #2 //// rgb_next_row = rgb + width
+
+ ADD x1,x1,x11 //// adjust u pointer
+ //ADD x2,x2,x12 @// adjust v pointer
+
+ ADD x7,x7,x10 //// luma_next = luma + width + offset (because of register crunch)
+ ADD x8,x8,x14,LSL #2 //// rgb_next_row = rgb + width + offset
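+    //// Net effect: x0/x7 advance by two luma rows, x1 by one chroma row,
+    //// and x2/x8 by two RGBA rows (RGB strides are in pixels, hence LSL #2)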
+
+ SUBS x5,x5,#1 //// height_cnt -= 1
+
+ BNE LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP
+
+ ////POP THE REGISTERS
+ // LDMFD sp!,{x4-x12,PC}
+ ldp x19, x20,[sp],#16
+ pop_v_regs
+ ret
+
+
+
+
+ .section .note.GNU-stack,"",%progbits
+