diff options
Diffstat (limited to 'decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s')
-rw-r--r-- | decoder/arm64/ihevcd_fmt_conv_420sp_to_rgba8888.s | 523 |
1 file changed, 523 insertions, 0 deletions
///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevcd_fmt_conv_420sp_to_rgba8888.s
//*
//* //brief
//*  contains function definitions for format conversions
//*
//* //author
//*  ittiam
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

    .equ DO1STROUNDING, 0

    // ARM
    //
    // PRESERVE8

.text
.p2align 2

.include "ihevc_neon_macros.s"



///*****************************************************************************
//*                                                                            *
//*  Function Name    : ihevcd_fmt_conv_420sp_to_rgba8888()                    *
//*                                                                            *
//*  Description      : Converts one frame from YUV420 semi-planar (NV12-style *
//*                     interleaved UV) color space to RGBA8888, two luma rows *
//*                     per iteration. The function can be invoked at the MB   *
//*                     level.                                                 *
//*                                                                            *
//*  Arguments        : NOTE(review): the list below is stale ARM32            *
//*                     boilerplate. From the register shuffle at function     *
//*                     entry, the AArch64 arguments appear to be:             *
//*                         x0  pu1_y (luma source)                            *
//*                         x1  pu1_uv (interleaved chroma source)             *
//*                         x2  pu4_rgba (destination)                         *
//*                         x3  width  (multiple of 16)                        *
//*                         x4  height (even)                                  *
//*                         x5  stride_y                                       *
//*                         x6  stride_uv                                      *
//*                         x7  stride_rgba (in pixels)                        *
//*                     -- confirm against the C prototype in the decoder.     *
//*                                                                            *
//*  Values Returned  : None                                                   *
//*                                                                            *
//*  Register Usage   : x0 - x14                                               *
//*                                                                            *
//*  Stack Usage      : 40 Bytes                                               *
//*                                                                            *
//*  Interruptibility : Interruptible                                          *
//*                                                                            *
//*  Known Limitations                                                         *
//*       Assumptions: Image Width:     Assumed to be multiple of 16 and       *
//*                     greater than or equal to 16                            *
//*                    Image Height:    Assumed to be even.                    *
//*                                                                            *
//*  Revision History :                                                        *
//*      DD MM YYYY   Author(s)       Changes (Describe the changes made)      *
//*      07 06 2010   Varshita        Draft                                    *
//*      07 06 2010   Naveen Kr T     Completed                                *
//*      05 08 2013   Naveen K P      Modified for HEVC                        *
//*****************************************************************************/

    .global ihevcd_fmt_conv_420sp_to_rgba8888_av8
.type ihevcd_fmt_conv_420sp_to_rgba8888_av8, function
ihevcd_fmt_conv_420sp_to_rgba8888_av8:

    //// push the registers on the stack
    // STMFD sp!,{x4-x12,x14}
    push_v_regs
    stp x19, x20,[sp,#-16]!         //// x19/x20 are callee-saved under AAPCS64


    ////x0 - Y PTR
    ////x1 - UV PTR
    ////x2 - RGB PTR
    ////x3 - WIDTH (see NOTE(review) in header; original comment said RGB PTR)
    ////x4 - PIC HT (moved to x5 below)
    ////x5 - STRIDE Y (moved to x6 below)
    ////x6 - STRIDE UV (moved to x7 below)
    ////x7 - STRIDE RGB (moved to x9 below)

    ////ONE ROW PROCESSING AT A TIME

    ////THE FOUR CONSTANTS ARE:
    ////C1=0x3311,C2=0xF379,C3=0xE5F8,C4=0x4092
    ////NOTE(review): C2/C3 have their sign bit set; the SMULLs below treat
    ////them as negative signed 16-bit coefficients.

    //PLD [x0]
    //PLD [x1]
    //PLD [x2]


    ///* can be loaded from a defined const type */
    mov x10,#0x3311
    mov v0.4h[0], w10               ////C1

    mov x10,#0xF379
    mov v0.4h[1], w10               ////C2

    mov x10,#0xE5F8
    mov v0.4h[2], w10               ////C3

    mov x10,#0x4092
    mov v0.4h[3], w10               ////C4

    ////LOAD CONSTANT 128 INTO A CORTEX REGISTER
    MOV x10,#128
    dup v1.8b,w10                   ////v1 = chroma bias, subtracted from U/V

    ////D0 HAS C1-C2-C3-C4
    //// load other parameters from stack
    //// (AArch64 passes them in registers; shuffle into the slots the
    ////  ARM32 code expected)
    mov x9, x7                      //// x9 = rgb stride
    mov x7, x6                      //// x7 = uv stride (temporarily)
    mov x6, x5                      //// x6 = luma stride
    mov x5, x4                      //// x5 = height
    //LDR x4,[sp,#44]
    //LDR x8,[sp,#52]

    //// calculate offsets, offset = stride - width
    SUB x10,x6,x3                   //// luma offset
    SUB x11,x7,x3
    //, LSR #1 @// u offset
    //SUB x12,x8,x3, LSR #1 @// v offset
    SUB x14,x9,x3                   //// rgb offset in pixels

    //// calculate height loop count
    LSR x5, x5, #1                  //// height_cnt = height / 2 (two rows per pass)

    //// create next row pointers for rgb and luma data
    ADD x7,x0,x6                    //// luma_next_row = luma + luma_stride
    ADD x8,x2,x9,LSL #2             //// rgb_next_row = rgb + rgb_stride (4 bytes/pixel)

LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP:

    ////LOAD VALUES OF U&V AND COMPUTE THE R,G,B WEIGHT VALUES.
    LD1 {v2.8b, v3.8b},[x1],#16     ////LOAD 8 VALUES OF UV
    ////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V

    //// calculate width loop count
    LSR x6, x3, #4                  //// width_cnt = width / 16

    ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
    ////LOAD VALUES OF Y 8-BIT VALUES
    LD2 {v30.8b, v31.8b},[x0],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
                                    ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    LD2 {v28.8b, v29.8b},[x7],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
                                    ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15

    SUBS x6,x6,#1                   //// last 16-pixel group handled after the loop
    BEQ LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP:
    //VMOV.I8 Q1,#128
    //// de-interleave UV -> v2 = U lane bytes, v3 = V lane bytes
    UZP1 v27.8b, v2.8b, v3.8b
    UZP2 v3.8b, v2.8b, v3.8b
    mov v2.d[0], v27.d[0]

    ////NEED TO SUBTRACT (U-128) AND (V-128)
    ////(D2-D1),(D3-D1)
    uSUBL v4.8h, v2.8b, v1.8b       ////(U-128)
    uSUBL v6.8h, v3.8b, v1.8b       ////(V-128)

    ////LOAD VALUES OF U&V for next row
    LD1 {v2.8b, v3.8b},[x1],#16     ////LOAD 8 VALUES OF U
    ////VLD1.8 {D3},[x2]! @//LOAD 8 VALUES OF V

    //PLD [x0]
    prfm PLDL1KEEP,[x1]

    ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
    sMULL v8.4s, v4.4h, v0.4h[3]    ////(U-128)*C4 FOR B
    sMULL2 v10.4s, v4.8h, v0.4h[3]  ////(U-128)*C4 FOR B

    sMULL v20.4s, v6.4h, v0.4h[0]   ////(V-128)*C1 FOR R
    sMULL2 v22.4s, v6.8h, v0.4h[0]  ////(V-128)*C1 FOR R

    sMULL v12.4s, v4.4h, v0.4h[1]   ////(U-128)*C2 FOR G
    sMLAL v12.4s, v6.4h, v0.4h[2]   ////Q6 = (U-128)*C2 + (V-128)*C3
    sMULL2 v14.4s, v4.8h, v0.4h[1]  ////(U-128)*C2 FOR G
    sMLAL2 v14.4s, v6.8h, v0.4h[2]  ////Q7 = (U-128)*C2 + (V-128)*C3

    ////NARROW RIGHT SHIFT BY 13 FOR R&B
    sqshrn v8.4h, v8.4s,#13         ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
    sqshrn2 v8.8h, v10.4s,#13       ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
                                    ////Q4 - WEIGHT FOR B
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
    sqshrn v10.4h, v20.4s,#13       ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
    sqshrn2 v10.8h, v22.4s,#13      ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
                                    ////Q5 - WEIGHT FOR R
    ////NARROW RIGHT SHIFT BY 13 FOR G
    sqshrn v12.4h, v12.4s,#13       ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    sqshrn2 v12.8h, v14.4s,#13      ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
                                    ////Q6 - WEIGHT FOR G

    //// add the chroma weights to the even/odd luma samples of row 1
    UADDW v14.8h, v8.8h , v30.8b    ////Q7 - HAS Y + B
    UADDW v16.8h, v10.8h , v30.8b   ////Q8 - HAS Y + R
    UADDW v18.8h, v12.8h , v30.8b   ////Q9 - HAS Y + G

    UADDW v20.8h, v8.8h , v31.8b    ////Q10 - HAS Y + B
    UADDW v22.8h, v10.8h , v31.8b   ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v31.8b   ////Q12 - HAS Y + G

    //// saturate to unsigned 8-bit; v17/v23 hold the constant alpha lane (0)
    sqxtun v14.8b, v14.8h
    sqxtun v15.8b, v18.8h
    sqxtun v16.8b, v16.8h
    movi v17.8b, #0

    sqxtun v20.8b, v20.8h
    sqxtun v21.8b, v24.8h
    sqxtun v22.8b, v22.8h
    movi v23.8b, #0

    //// interleave B,G and R,A bytes (ZIP1/ZIP2 + d-lane moves emulate
    //// the ARM32 VZIP in-place behavior)
    ZIP1 v27.8b, v14.8b, v15.8b
    ZIP2 v15.8b, v14.8b, v15.8b
    mov v14.d[0], v27.d[0]
    ZIP1 v27.8b, v16.8b, v17.8b
    ZIP2 v17.8b, v16.8b, v17.8b
    mov v16.d[0], v27.d[0]

    ZIP1 v27.8b, v20.8b, v21.8b
    ZIP2 v21.8b, v20.8b, v21.8b
    mov v20.d[0], v27.d[0]
    ZIP1 v27.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b
    mov v22.d[0], v27.d[0]

    mov v14.d[1], v15.d[0]
    mov v20.d[1], v21.d[0]
    mov v16.d[1], v17.d[0]
    mov v22.d[1], v23.d[0]

    //// interleave 16-bit BG/RA pairs, then 32-bit pixels, giving 16
    //// consecutive 32-bit RGBA pixels across v14,v20,v16,v22
    ZIP1 v27.8h, v14.8h, v16.8h
    ZIP2 v26.8h, v14.8h, v16.8h

    ZIP1 v25.8h, v20.8h, v22.8h
    ZIP2 v19.8h, v20.8h, v22.8h

    ZIP1 v14.4s, v27.4s, v25.4s
    ZIP2 v20.4s, v27.4s, v25.4s

    ZIP1 v16.4s, v26.4s, v19.4s
    ZIP2 v22.4s, v26.4s, v19.4s

    ST1 {v14.4s},[x2],#16
    ST1 {v20.4s},[x2],#16
    ST1 {v16.4s},[x2],#16
    ST1 {v22.4s},[x2],#16

    ////D14-D20 - TOTALLY HAVE 16 VALUES
    ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
    //// (comment retained from the RGB565 variant; here the output is 8888)
    //// same chroma weights reused for luma row 2 (420 subsampling)
    UADDW v14.8h, v8.8h , v28.8b    ////Q7 - HAS Y + B
    UADDW v16.8h, v10.8h , v28.8b   ////Q2 - HAS Y + R
    UADDW v18.8h, v12.8h , v28.8b   ////Q3 - HAS Y + G

    UADDW v20.8h, v8.8h , v29.8b    ////Q10 - HAS Y + B
    UADDW v22.8h, v10.8h , v29.8b   ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v29.8b   ////Q12 - HAS Y + G

    ////COMPUTE THE ACTUAL RGB VALUES,WE CAN DO TWO ROWS AT A TIME
    ////LOAD VALUES OF Y 8-BIT VALUES (for the next iteration)
    LD2 {v30.8b, v31.8b},[x0],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row 1
                                    ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15
    LD2 {v28.8b, v29.8b},[x7],#16   ////D0 - Y0,Y2,Y4,Y6,Y8,Y10,Y12,Y14 row2
                                    ////D1 - Y1,Y3,Y5,Y7,Y9,Y11,Y13,Y15

    prfm PLDL1KEEP,[x0]
    prfm PLDL1KEEP,[x7]

    //// saturate/interleave/store row 2 exactly as row 1, but to x8
    sqxtun v14.8b, v14.8h
    sqxtun v15.8b, v18.8h
    sqxtun v16.8b, v16.8h
    movi v17.8b, #0

    sqxtun v20.8b, v20.8h
    sqxtun v21.8b, v24.8h
    sqxtun v22.8b, v22.8h
    movi v23.8b, #0

    ZIP1 v27.8b, v14.8b, v15.8b
    ZIP2 v15.8b, v14.8b, v15.8b
    mov v14.d[0], v27.d[0]
    ZIP1 v27.8b, v16.8b, v17.8b
    ZIP2 v17.8b, v16.8b, v17.8b
    mov v16.d[0], v27.d[0]

    ZIP1 v27.8b, v20.8b, v21.8b
    ZIP2 v21.8b, v20.8b, v21.8b
    mov v20.d[0], v27.d[0]
    ZIP1 v27.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b
    mov v22.d[0], v27.d[0]

    mov v14.d[1], v15.d[0]
    mov v20.d[1], v21.d[0]
    mov v16.d[1], v17.d[0]
    mov v22.d[1], v23.d[0]

    ZIP1 v27.8h, v14.8h, v16.8h
    ZIP2 v26.8h, v14.8h, v16.8h

    ZIP1 v25.8h, v20.8h, v22.8h
    ZIP2 v19.8h, v20.8h, v22.8h

    ZIP1 v14.4s, v27.4s, v25.4s
    ZIP2 v20.4s, v27.4s, v25.4s

    ZIP1 v16.4s, v26.4s, v19.4s
    ZIP2 v22.4s, v26.4s, v19.4s

    ST1 {v14.4s},[x8],#16
    ST1 {v20.4s},[x8],#16
    ST1 {v16.4s},[x8],#16
    ST1 {v22.4s},[x8],#16

    SUBS x6,x6,#1                   //// width_cnt -= 1
    BNE LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP

LABEL_YUV420SP_TO_RGB8888_WIDTH_LOOP_SKIP:
    //// epilogue of the width loop: convert the final 16-pixel group
    //// (same computation as the loop body, minus the next-group loads)
    //VMOV.I8 Q1,#128
    UZP1 v27.8b, v2.8b, v3.8b
    UZP2 v3.8b, v2.8b, v3.8b
    mov v2.d[0], v27.d[0]


    ////NEED TO SUBTRACT (U-128) AND (V-128)
    ////(D2-D1),(D3-D1)
    uSUBL v4.8h, v2.8b, v1.8b       ////(U-128)
    uSUBL v6.8h, v3.8b, v1.8b       ////(V-128)


    ////NEED TO MULTIPLY WITH Q2,Q3 WITH CO-EEFICIENTS
    sMULL v8.4s, v4.4h, v0.4h[3]    ////(U-128)*C4 FOR B
    sMULL2 v10.4s, v4.8h, v0.4h[3]  ////(U-128)*C4 FOR B

    sMULL v20.4s, v6.4h, v0.4h[0]   ////(V-128)*C1 FOR R
    sMULL2 v22.4s, v6.8h, v0.4h[0]  ////(V-128)*C1 FOR R

    sMULL v12.4s, v4.4h, v0.4h[1]   ////(U-128)*C2 FOR G
    sMLAL v12.4s, v6.4h, v0.4h[2]   ////Q6 = (U-128)*C2 + (V-128)*C3
    sMULL2 v14.4s, v4.8h, v0.4h[1]  ////(U-128)*C2 FOR G
    sMLAL2 v14.4s, v6.8h, v0.4h[2]  ////Q7 = (U-128)*C2 + (V-128)*C3

    ////NARROW RIGHT SHIFT BY 13 FOR R&B
    sqshrn v8.4h, v8.4s,#13         ////D8 = (U-128)*C4>>13 4 16-BIT VALUES
    sqshrn2 v8.8h, v10.4s,#13       ////D9 = (U-128)*C4>>13 4 16-BIT VALUES
                                    ////Q4 - WEIGHT FOR B
    ////NARROW RIGHT SHIFT BY 13 FOR R&B
    sqshrn v10.4h, v20.4s,#13       ////D10 = (V-128)*C1>>13 4 16-BIT VALUES
    sqshrn2 v10.8h, v22.4s,#13      ////D11 = (V-128)*C1>>13 4 16-BIT VALUES
                                    ////Q5 - WEIGHT FOR R
    ////NARROW RIGHT SHIFT BY 13 FOR G
    sqshrn v12.4h, v12.4s,#13       ////D12 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
    sqshrn2 v12.8h, v14.4s,#13      ////D13 = [(U-128)*C2 + (V-128)*C3]>>13 4 16-BIT VALUES
                                    ////Q6 - WEIGHT FOR G

    UADDW v14.8h, v8.8h , v30.8b    ////Q7 - HAS Y + B
    UADDW v16.8h, v10.8h , v30.8b   ////Q8 - HAS Y + R
    UADDW v18.8h, v12.8h , v30.8b   ////Q9 - HAS Y + G

    UADDW v20.8h, v8.8h , v31.8b    ////Q10 - HAS Y + B
    UADDW v22.8h, v10.8h , v31.8b   ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v31.8b   ////Q12 - HAS Y + G

    sqxtun v14.8b, v14.8h
    sqxtun v15.8b, v18.8h
    sqxtun v16.8b, v16.8h
    movi v17.8b, #0

    sqxtun v20.8b, v20.8h
    sqxtun v21.8b, v24.8h
    sqxtun v22.8b, v22.8h
    movi v23.8b, #0

    ZIP1 v27.8b, v14.8b, v15.8b
    ZIP2 v15.8b, v14.8b, v15.8b
    mov v14.d[0], v27.d[0]
    ZIP1 v27.8b, v16.8b, v17.8b
    ZIP2 v17.8b, v16.8b, v17.8b
    mov v16.d[0], v27.d[0]

    ZIP1 v27.8b, v20.8b, v21.8b
    ZIP2 v21.8b, v20.8b, v21.8b
    mov v20.d[0], v27.d[0]
    ZIP1 v27.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b
    mov v22.d[0], v27.d[0]

    mov v14.d[1], v15.d[0]
    mov v20.d[1], v21.d[0]
    mov v16.d[1], v17.d[0]
    mov v22.d[1], v23.d[0]

    ZIP1 v27.8h, v14.8h, v16.8h
    ZIP2 v26.8h, v14.8h, v16.8h

    ZIP1 v25.8h, v20.8h, v22.8h
    ZIP2 v19.8h, v20.8h, v22.8h

    ZIP1 v14.4s, v27.4s, v25.4s
    ZIP2 v20.4s, v27.4s, v25.4s

    ZIP1 v16.4s, v26.4s, v19.4s
    ZIP2 v22.4s, v26.4s, v19.4s

    ST1 {v14.4s},[x2],#16
    ST1 {v20.4s},[x2],#16
    ST1 {v16.4s},[x2],#16
    ST1 {v22.4s},[x2],#16

    ////D14-D20 - TOTALLY HAVE 16 VALUES
    ////WE NEED TO SHIFT R,G,B VALUES TO GET 5BIT,6BIT AND 5BIT COMBINATIONS
    //// (comment retained from the RGB565 variant; here the output is 8888)
    UADDW v14.8h, v8.8h , v28.8b    ////Q7 - HAS Y + B
    UADDW v16.8h, v10.8h , v28.8b   ////Q2 - HAS Y + R
    UADDW v18.8h, v12.8h , v28.8b   ////Q3 - HAS Y + G

    UADDW v20.8h, v8.8h , v29.8b    ////Q10 - HAS Y + B
    UADDW v22.8h, v10.8h , v29.8b   ////Q11 - HAS Y + R
    UADDW v24.8h, v12.8h , v29.8b   ////Q12 - HAS Y + G

    sqxtun v14.8b, v14.8h
    sqxtun v15.8b, v18.8h
    sqxtun v16.8b, v16.8h
    movi v17.8b, #0

    sqxtun v20.8b, v20.8h
    sqxtun v21.8b, v24.8h
    sqxtun v22.8b, v22.8h
    movi v23.8b, #0

    ZIP1 v27.8b, v14.8b, v15.8b
    ZIP2 v15.8b, v14.8b, v15.8b
    mov v14.d[0], v27.d[0]
    ZIP1 v27.8b, v16.8b, v17.8b
    ZIP2 v17.8b, v16.8b, v17.8b
    mov v16.d[0], v27.d[0]

    ZIP1 v27.8b, v20.8b, v21.8b
    ZIP2 v21.8b, v20.8b, v21.8b
    mov v20.d[0], v27.d[0]
    ZIP1 v27.8b, v22.8b, v23.8b
    ZIP2 v23.8b, v22.8b, v23.8b
    mov v22.d[0], v27.d[0]

    mov v14.d[1], v15.d[0]
    mov v20.d[1], v21.d[0]
    mov v16.d[1], v17.d[0]
    mov v22.d[1], v23.d[0]

    ZIP1 v27.8h, v14.8h, v16.8h
    ZIP2 v26.8h, v14.8h, v16.8h

    ZIP1 v25.8h, v20.8h, v22.8h
    ZIP2 v19.8h, v20.8h, v22.8h

    ZIP1 v14.4s, v27.4s, v25.4s
    ZIP2 v20.4s, v27.4s, v25.4s

    ZIP1 v16.4s, v26.4s, v19.4s
    ZIP2 v22.4s, v26.4s, v19.4s

    ST1 {v14.4s},[x8],#16
    ST1 {v20.4s},[x8],#16
    ST1 {v16.4s},[x8],#16
    ST1 {v22.4s},[x8],#16

    //// Adjust the address pointers
    ADD x0,x7,x10                   //// luma = luma_next + offset
    ADD x2,x8,x14,LSL #2            //// rgb = rgb_next + offset

    ADD x7,x0,x3                    //// luma_next = luma + width
    ADD x8,x2,x3,LSL #2             //// rgb_next_row = rgb + width

    ADD x1,x1,x11                   //// adjust u pointer
    //ADD x2,x2,x12 @// adjust v pointer

    ADD x7,x7,x10                   //// luma_next = luma + width + offset (because of register crunch)
    ADD x8,x8,x14,LSL #2            //// rgb_next_row = rgb + width + offset

    SUBS x5,x5,#1                   //// height_cnt -= 1

    BNE LABEL_YUV420SP_TO_RGB8888_HEIGHT_LOOP

    ////POP THE REGISTERS
    // LDMFD sp!,{x4-x12,PC}
    ldp x19, x20,[sp],#16
    pop_v_regs
    ret




    .section .note.GNU-stack,"",%progbits