/* libs/pixelflinger/t32cb16blend.S
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
**     http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/

/*
 * Target: 32-bit ARM, GNU as, unified syntax.
 */

        .text
        .syntax unified
        .balign 4

        .global scanline_t32cb16blend_arm


/*
 * .macro pixel
 *
 * Blends one 32-bit source pixel over one RGB565 destination pixel.
 *
 * \DREG is a 32-bit register containing *two* original destination RGB565
 * pixels, with the even one in the low-16 bits, and the odd one in the
 * high 16 bits.
 *
 * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
 *
 * \FB is a target register that will contain the blended pixel values.
 *
 * \ODD is either 0 or 1 and indicates if we're blending the lower or
 * upper 16-bit pixels in DREG into FB
 *
 * For every channel the macro computes
 *
 *      f   = 0x100 - (sA + (sA >> 7))          (0x100 for sA=0, 0 for sA=255)
 *      out = min(max, src_top_bits + ((dst * f) >> 8))
 *
 * where src_top_bits is the top 5 (red, blue) or 6 (green) bits of the
 * source channel and max is 0x1F or 0x3F respectively.
 *
 * Ordering: the \ODD = 0 expansion *overwrites* \FB (its red channel is
 * written with mov) while the \ODD = 1 expansion only ORs into \FB.  When
 * both halves of a word are blended, expand the even pixel first.
 *
 * clobbered: r6, r7, lr, condition flags
 *
 */

.macro pixel,   DREG, SRC, FB, ODD

        // SRC = 0xAABBGGRR
        mov     r7, \SRC, lsr #24               // sA
        add     r7, r7, r7, lsr #7              // sA + (sA >> 7)
        rsb     r7, r7, #0x100                  // r7 = f = 0x100 - (sA+(sA>>7))

.if \ODD

        // red: the top 5 bits of DREG need no mask after the shift
        mov     lr, \DREG, lsr #(16 + 11)
        smulbb  lr, r7, lr                      // f * dR
        mov     r6, \SRC, lsr #3
        and     r6, r6, #0x1F                   // sR, top 5 bits
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate to 5 bits
        orrhs   \FB, \FB, #(0x1F<<(16 + 11))
        orrlo   \FB, \FB, lr, lsl #(16 + 11)

        // green: smulbt reads the top halfword of r6, so the multiplicand
        // is dG still shifted left by 5; lsr #(5+8) undoes both scalings
        and     r6, \DREG, #(0x3F<<(16 + 5))
        smulbt  r6, r7, r6
        mov     lr, \SRC, lsr #(8+2)
        and     lr, lr, #0x3F                   // sG, top 6 bits
        add     r6, lr, r6, lsr #(5+8)
        cmp     r6, #0x3F                       // saturate to 6 bits
        orrhs   \FB, \FB, #(0x3F<<(16 + 5))
        orrlo   \FB, \FB, r6, lsl #(16 + 5)

        // blue: smulbt reads the top halfword of lr, i.e. dB itself
        and     lr, \DREG, #(0x1F << 16)
        smulbt  lr, r7, lr
        mov     r6, \SRC, lsr #(8+8+3)
        and     r6, r6, #0x1F                   // sB, top 5 bits
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate to 5 bits
        orrhs   \FB, \FB, #(0x1F << 16)
        orrlo   \FB, \FB, lr, lsl #16

.else

        // red
        mov     lr, \DREG, lsr #11
        and     lr, lr, #0x1F                   // dR
        smulbb  lr, r7, lr                      // f * dR
        mov     r6, \SRC, lsr #3
        and     r6, r6, #0x1F                   // sR, top 5 bits
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate to 5 bits
        movhs   \FB, #(0x1F<<11)                // mov, not orr: initialises FB
        movlo   \FB, lr, lsl #11

        // green: dG is multiplied while still shifted left by 5;
        // lsr #(5+8) undoes both scalings
        and     r6, \DREG, #(0x3F<<5)
        smulbb  r6, r7, r6
        mov     lr, \SRC, lsr #(8+2)
        and     lr, lr, #0x3F                   // sG, top 6 bits
        add     r6, lr, r6, lsr #(5+8)
        cmp     r6, #0x3F                       // saturate to 6 bits
        orrhs   \FB, \FB, #(0x3F<<5)
        orrlo   \FB, \FB, r6, lsl #5

        // blue
        and     lr, \DREG, #0x1F                // dB
        smulbb  lr, r7, lr
        mov     r6, \SRC, lsr #(8+8+3)
        and     r6, r6, #0x1F                   // sB, top 5 bits
        add     lr, r6, lr, lsr #8
        cmp     lr, #0x1F                       // saturate to 5 bits
        orrhs   \FB, \FB, #0x1F
        orrlo   \FB, \FB, lr

.endif

.endm


/*
 * scanline_t32cb16blend_arm
 *
 * Blends a run of 32-bit premultiplied source pixels over a run of RGB565
 * destination pixels, in place.
 *
 * In:    r0 = dst ptr (16-bit pixels; only 32-bit alignment is tested, so
 *             this presumably must be at least 2-byte aligned -- confirm
 *             against callers)
 *        r1 = src ptr (32-bit pixels, read with ldr/ldmia)
 *        r2 = count, in pixels, treated as unsigned; 0 is allowed
 * Out:   nothing in registers; dst memory is updated
 * Saved: r4-r7, lr (pushed on entry, popped before returning)
 * Clobb: r0-r3, r12, condition flags
 * Stack: 20 bytes; no calls are made
 *
 * Register roles inside the routine:
 *
 * r0:  dst ptr
 * r1:  src ptr
 * r2:  count
 * r3:  d   (destination word / halfword)
 * r4:  s0  (source pixel for the even destination pixel)
 * r5:  s1  (source pixel for the odd destination pixel)
 * r6:  pixel macro scratch
 * r7:  pixel macro scratch
 * r8:  free
 * r9:  free
 * r10: free
 * r11: free
 * r12: blended output
 * r14: pixel macro scratch
 */

scanline_t32cb16blend_arm:
        stmfd   sp!, {r4-r7, lr}

        pld     [r0]
        pld     [r1]

        // align DST to 32 bits
        tst     r0, #0x3
        beq     aligned
        subs    r2, r2, #1
        ldmfdlo sp!, {r4-r7, lr}        // count was 0: return
        bxlo    lr

        // Blend a single pixel: used for the unaligned head and for the
        // odd tail (entered from 9: below).
last:
        ldr     r4, [r1], #4
        ldrh    r3, [r0]
        pixel   r3, r4, r12, 0
        strh    r12, [r0], #2

        // Invariant from here on: r2 = (pixels remaining) - 2 whenever a
        // pair is about to be processed.
aligned:
        subs    r2, r2, #2
        blo     9f

        // The main loop is unrolled twice and processes 4 pixels
8:      ldmia   r1!, {r4, r5}
        // stream the source
        pld     [r1, #32]
        add     r0, r0, #4
        // it's all zero, skip this pixel
        // (both sources are 0: the blend would leave dst unchanged)
        orrs    r3, r4, r5
        beq     7f

        // load the destination
        ldr     r3, [r0, #-4]
        // stream the destination
        pld     [r0, #32]
        pixel   r3, r4, r12, 0
        pixel   r3, r5, r12, 1
        // effectively, we're getting write-combining by virtue of the
        // cpu's write-back cache.
        str     r12, [r0, #-4]

        // 2nd iteration of the loop, don't stream anything
        subs    r2, r2, #2
        blo     9f                      // fewer than 2 pixels left
        ldmia   r1!, {r4, r5}
        add     r0, r0, #4
        orrs    r3, r4, r5
        beq     7f
        ldr     r3, [r0, #-4]
        pixel   r3, r4, r12, 0
        pixel   r3, r5, r12, 1
        str     r12, [r0, #-4]

7:      subs    r2, r2, #2
        bhs     8b

        // Here r2 is -1 (one pixel left) or -2 (none left).
9:      adds    r2, r2, #1
        ldmfdlo sp!, {r4-r7, lr}        // nothing left: return
        bxlo    lr
        b       last                    // blend the final pixel, then exit
                                        // through aligned: -> 9: