summary | refs | log | tree | commit | diff | stats
path: root/libpixelflinger/t32cb16blend.S
diff options
context:
space:
mode:
author Mathias Agopian <mathias@google.com> 2009-08-18 01:07:35 -0700
committer Mathias Agopian <mathias@google.com> 2009-08-18 14:34:51 -0700
commit 9d881764173ce16badb6f1098ba5cf44b36f9aec (patch)
tree b55a74ac8ff756074db96676e6b281be33b89d1f /libpixelflinger/t32cb16blend.S
parent 7f5b1a2b317874a9009d20ed4d3789a7cb508ca4 (diff)
download system_core-9d881764173ce16badb6f1098ba5cf44b36f9aec.tar.gz
system_core-9d881764173ce16badb6f1098ba5cf44b36f9aec.tar.bz2
system_core-9d881764173ce16badb6f1098ba5cf44b36f9aec.zip
fix part of [2017702] OpenGL bugs with alpha values of 1.0 in the source during blending into 8888 buffers
When the ONE / ONE_MINUS_SRC_ALPHA blending mode was used, the code wasn't saturating the color components. The reason was that this mode is meant for premultiplied-alpha blending; however, if it is used with a non-premultiplied source, the color components would wrap. Unfortunately, this costs 6 extra cycles per pixel; however, "correctness" prevails. This should not impact the UI, since it uses h/w acceleration most of the time. It also doesn't impact games, which should be using h/w GL. This change will slow the emulator down a bit.
Diffstat (limited to 'libpixelflinger/t32cb16blend.S')
-rw-r--r-- libpixelflinger/t32cb16blend.S 65
1 files changed, 48 insertions, 17 deletions
diff --git a/libpixelflinger/t32cb16blend.S b/libpixelflinger/t32cb16blend.S
index d4b257981..caf9eb7cd 100644
--- a/libpixelflinger/t32cb16blend.S
+++ b/libpixelflinger/t32cb16blend.S
@@ -21,53 +21,80 @@
.global scanline_t32cb16blend_arm
-// uses r6, r7, lr
-.macro pixel, DREG, SRC, FB, OFFSET
-
- // SRC = AARRGGBB
+/*
+ * .macro pixel
+ *
+ * \DREG is a 32-bit register containing *two* original destination RGB565
+ * pixels, with the even one in the low-16 bits, and the odd one in the
+ * high 16 bits.
+ *
+ * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
+ *
+ * \FB is a target register that will contain the blended pixel values.
+ *
+ * \ODD is either 0 or 1: 0 blends the lower (even) 16-bit pixel of DREG
+ * into FB, 1 blends the upper (odd) 16-bit pixel of DREG into FB
+ *
+ *
+ * clobbered: r6, r7, lr, and the condition flags (modified by cmp)
+ *
+ */
+
+.macro pixel, DREG, SRC, FB, ODD
+
+ // SRC = 0xAABBGGRR
mov r7, \SRC, lsr #24 // sA
add r7, r7, r7, lsr #7 // sA + (sA >> 7)
rsb r7, r7, #0x100 // sA = 0x100 - (sA+(sA>>7))
1:
-.if \OFFSET
+.if \ODD
// red
- mov lr, \DREG, lsr #(\OFFSET + 6 + 5)
+ mov lr, \DREG, lsr #(16 + 11)
smulbb lr, r7, lr
mov r6, \SRC, lsr #3
and r6, r6, #0x1F
add lr, r6, lr, lsr #8
- orr \FB, lr, lsl #(\OFFSET + 11)
+ cmp lr, #0x1F
+ orrhs \FB, \FB, #(0x1F<<(16 + 11))
+ orrlo \FB, \FB, lr, lsl #(16 + 11)
// green
- and r6, \DREG, #(0x3F<<(\OFFSET + 5))
+ and r6, \DREG, #(0x3F<<(16 + 5))
smulbt r6, r7, r6
mov lr, \SRC, lsr #(8+2)
and lr, lr, #0x3F
add r6, lr, r6, lsr #(5+8)
- orr \FB, \FB, r6, lsl #(\OFFSET + 5)
+ cmp r6, #0x3F
+ orrhs \FB, \FB, #(0x3F<<(16 + 5))
+ orrlo \FB, \FB, r6, lsl #(16 + 5)
// blue
- and lr, \DREG, #(0x1F << \OFFSET)
+ and lr, \DREG, #(0x1F << 16)
smulbt lr, r7, lr
mov r6, \SRC, lsr #(8+8+3)
and r6, r6, #0x1F
add lr, r6, lr, lsr #8
- orr \FB, \FB, lr, lsl #\OFFSET
+ cmp lr, #0x1F
+ orrhs \FB, \FB, #(0x1F << 16)
+ orrlo \FB, \FB, lr, lsl #16
.else
// red
- mov lr, \DREG, lsr #(6+5)
+ mov lr, \DREG, lsr #11
and lr, lr, #0x1F
smulbb lr, r7, lr
mov r6, \SRC, lsr #3
and r6, r6, #0x1F
add lr, r6, lr, lsr #8
- mov \FB, lr, lsl #11
+ cmp lr, #0x1F
+ movhs \FB, #(0x1F<<11)
+ movlo \FB, lr, lsl #11
+
// green
and r6, \DREG, #(0x3F<<5)
@@ -75,7 +102,9 @@
mov lr, \SRC, lsr #(8+2)
and lr, lr, #0x3F
add r6, lr, r6, lsr #(5+8)
- orr \FB, \FB, r6, lsl #5
+ cmp r6, #0x3F
+ orrhs \FB, \FB, #(0x3F<<5)
+ orrlo \FB, \FB, r6, lsl #5
// blue
and lr, \DREG, #0x1F
@@ -83,7 +112,9 @@
mov r6, \SRC, lsr #(8+8+3)
and r6, r6, #0x1F
add lr, r6, lr, lsr #8
- orr \FB, \FB, lr
+ cmp lr, #0x1F
+ orrhs \FB, \FB, #0x1F
+ orrlo \FB, \FB, lr
.endif
@@ -128,7 +159,7 @@ aligned:
subs r2, r2, #2
blo 9f
- // The main loop is unrolled twice and process 4 pixels
+ // The main loop is unrolled twice and processes 4 pixels
8: ldmia r1!, {r4, r5}
// stream the source
pld [r1, #32]
@@ -142,7 +173,7 @@ aligned:
// stream the destination
pld [r0, #32]
pixel r3, r4, r12, 0
- pixel r3, r5, r12, 16
+ pixel r3, r5, r12, 1
// effectively, we're getting write-combining by virtue of the
// cpu's write-back cache.
str r12, [r0, #-4]