// libpixelflinger/col32cb16blend_neon.S

/* libs/pixelflinger/col32cb16blend_neon.S
 *
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


    .text
    .balign 4

    .global scanline_col32cb16blend_neon

//
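// This function alpha blends a fixed color into a destination scanline of
// 16-bit 5-6-5 pixels. Each channel is computed using the formula:
//
//     d = ((s << n) + (256 - (a + (a >> 7))) * d) >> 8
//
// where d is the destination channel (5 bits for red/blue, 6 bits for green),
//       s is the matching 8-bit channel of the source color,
//       n is 5 for red and blue, 6 for green,
//       a is the alpha channel of the source color.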
//
// The NEON implementation processes 16 pixels per iteration. The remaining 0 - 15
// pixels are processed in ARM code.
//

// r0 = destination buffer pointer
// r1 = color pointer
// r2 = count


// scanline_col32cb16blend_neon
//
// Blends one constant 32-bit color over a run of 16-bit 5-6-5 pixels, in place.
// Per channel the stored result is
//
//     d = ((s << n) + (256 - (a + (a >> 7))) * d) >> 8
//
// with n = 5 for red and blue, n = 6 for green. Blocks of 16 pixels go through
// the NEON loop (label 1); the remaining 0 - 15 pixels go through the scalar
// loop (label 3).
//
// In:    r0 = destination buffer pointer (16-bit pixels, rewritten in place)
//        r1 = color pointer (bytes in memory order: red, green, blue, alpha)
//        r2 = count of pixels (treated as unsigned)
// Out:   nothing
// Clobb: r0, r3, r12, flags, q0-q3, q8-q15 (r4-r11 are saved and restored)
//
// NOTE(review): presumably called from C as
//     void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *color, size_t count);
// confirm against the caller. The scalar path reads the color with a 32-bit
// ldr and takes red from the low byte, so it assumes little-endian data.
//
    .arm                                        // body uses ARM-state operand forms
    .type       scanline_col32cb16blend_neon, %function // mark symbol as code for interworking
scanline_col32cb16blend_neon:
    push        {r4-r11, lr}                    // stack ARM regs

    vmov.u16    q15, #256                       // create alpha constant
    movs        r3, r2, lsr #4                  // r3 = count / 16 = NEON iterations
    vmov.u16    q14, #0x1f                      // create blue mask

    beq         2f                              // if r3 == 0, branch to singles

    vld4.8      {d0[], d2[], d4[], d6[]}, [r1]  // load color into four registers
                                                //  split and duplicate them, such that
                                                //  d0 = 8 equal red values
                                                //  d2 = 8 equal green values
                                                //  d4 = 8 equal blue values
                                                //  d6 = 8 equal alpha values
    vshll.u8    q0, d0, #5                      // q0 = src red << 5, widened to 16 bits
    vshll.u8    q1, d2, #6                      // q1 = src green << 6, widened to 16 bits
    vshll.u8    q2, d4, #5                      // q2 = src blue << 5, widened to 16 bits

    vshr.u8     d7, d6, #7                      // extract top bit of alpha
    vaddl.u8    q3, d6, d7                      // q3 = a + (a >> 7), range 0..256
    vsub.u16    q3, q15, q3                     // q3 = 256 - q3 = inverse alpha

1:
    // This loop processes 16 pixels per iteration. In the comments, references to
    // the first eight pixels are suffixed with "0" (red0, green0, blue0),
    // the second eight are suffixed "1".
    // The two halves are interleaved instruction by instruction; keep the order.
                                                // q8  = dst red0
                                                // q9  = dst green0
                                                // q10 = dst blue0
                                                // q13 = dst red1
                                                // q12 = dst green1
                                                // q11 = dst blue1

    vld1.16     {d20, d21, d22, d23}, [r0]      // load 16 dest pixels into q10, q11
    vshr.u16    q8, q10, #11                    // shift dst red0 to low 5 bits
    pld         [r0, #63]                       // preload next dest pixels
    vshl.u16    q9, q10, #5                     // shift dst green0 to top 6 bits
    vand        q10, q10, q14                   // extract dst blue0
    vshr.u16    q9, q9, #10                     // shift dst green0 to low 6 bits
    vmul.u16    q8, q8, q3                      // multiply dst red0 by inverse alpha
    vshl.u16    q12, q11, #5                    // shift dst green1 to top 6 bits
    vmul.u16    q9, q9, q3                      // multiply dst green0 by inverse alpha
    vshr.u16    q13, q11, #11                   // shift dst red1 to low 5 bits
    vmul.u16    q10, q10, q3                    // multiply dst blue0 by inverse alpha
    vshr.u16    q12, q12, #10                   // shift dst green1 to low 6 bits
    vand        q11, q11, q14                   // extract dst blue1
    vadd.u16    q8, q8, q0                      // add src red to dst red0
    vmul.u16    q13, q13, q3                    // multiply dst red1 by inverse alpha
    vadd.u16    q9, q9, q1                      // add src green to dst green0
    vmul.u16    q12, q12, q3                    // multiply dst green1 by inverse alpha
    vadd.u16    q10, q10, q2                    // add src blue to dst blue0
    vmul.u16    q11, q11, q3                    // multiply dst blue1 by inverse alpha
    vshr.u16    q8, q8, #8                      // shift down red0
    vadd.u16    q13, q13, q0                    // add src red to dst red1
    vshr.u16    q9, q9, #8                      // shift down green0
    vadd.u16    q12, q12, q1                    // add src green to dst green1
    vshr.u16    q10, q10, #8                    // shift down blue0
    vadd.u16    q11, q11, q2                    // add src blue to dst blue1
    vsli.u16    q10, q9, #5                     // shift & insert green0 into blue0
    vshr.u16    q13, q13, #8                    // shift down red1
    vsli.u16    q10, q8, #11                    // shift & insert red0 into blue0
    vshr.u16    q12, q12, #8                    // shift down green1
    vshr.u16    q11, q11, #8                    // shift down blue1
    subs        r3, r3, #1                      // decrement loop counter (sets flags for bne)
    vsli.u16    q11, q12, #5                    // shift & insert green1 into blue1
    vsli.u16    q11, q13, #11                   // shift & insert red1 into blue1

    vst1.16     {d20, d21, d22, d23}, [r0]!     // write 16 pixels back to dst, advance r0
    bne         1b                              // if count != 0, loop

2:
    ands        r3, r2, #15                     // r3 = count % 16 = single iterations
    beq         4f                              // if r3 == 0, exit

    ldr         r4, [r1]                        // load source color
    mov         r5, r4, lsr #24                 // shift down alpha
    add         r5, r5, r5, lsr #7              // r5 = a + (a >> 7), range 0..256
    rsb         r5, r5, #256                    // r5 = 256 - r5 = inverse alpha
    and         r11, r4, #0xff                  // extract red
    ubfx        r12, r4, #8, #8                 // extract green
    ubfx        r4, r4, #16, #8                 // extract blue
    mov         r11, r11, lsl #5                // prescale red
    mov         r12, r12, lsl #6                // prescale green
    mov         r4, r4, lsl #5                  // prescale blue

3:
    ldrh        r8, [r0]                        // load dest pixel
    subs        r3, r3, #1                      // decrement loop counter; nothing below
                                                //  rewrites the Z flag before the bne
    mov         r6, r8, lsr #11                 // extract dest red
    ubfx        r7, r8, #5, #6                  // extract dest green
    and         r8, r8, #0x1f                   // extract dest blue

    smlabb      r6, r6, r5, r11                 // dest red * inverse alpha + src red
    smlabb      r7, r7, r5, r12                 // dest green * inverse alpha + src green
    smlabb      r8, r8, r5, r4                  // dest blue * inverse alpha + src blue

    mov         r6, r6, lsr #8                  // shift down red
    mov         r7, r7, lsr #8                  // shift down green
    mov         r6, r6, lsl #11                 // shift red into 565
    orr         r6, r7, lsl #5                  // shift green into 565
    orr         r6, r8, lsr #8                  // shift down blue and merge into 565

    strh        r6, [r0], #2                    // store pixel to dest, update ptr
    bne         3b                              // if count != 0, loop
4:

    pop         {r4-r11, pc}                    // restore regs and return
    .size       scanline_col32cb16blend_neon, . - scanline_col32cb16blend_neon