summaryrefslogtreecommitdiffstats
path: root/vp8/common/arm/armv6/simpleloopfilter_v6.asm
diff options
context:
space:
mode:
Diffstat (limited to 'vp8/common/arm/armv6/simpleloopfilter_v6.asm')
-rw-r--r--vp8/common/arm/armv6/simpleloopfilter_v6.asm188
1 files changed, 77 insertions, 111 deletions
diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
index 15c6c7d..0137120 100644
--- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm
+++ b/vp8/common/arm/armv6/simpleloopfilter_v6.asm
@@ -1,10 +1,11 @@
;
-; Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
;
@@ -54,113 +55,87 @@ pstep RN r1
;stack const char *thresh,
;stack int count
-;Note: All 16 elements in flimit are equal. So, in the code, only one load is needed
-;for flimit. Same way applies to limit and thresh.
+; All 16 elements in flimit are equal. So, in the code, only one load is needed
+; for flimit. Same applies to limit. thresh is not used in simple looopfilter
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
|vp8_loop_filter_simple_horizontal_edge_armv6| PROC
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
- sub src, src, pstep, lsl #1 ; move src pointer down by 2 lines
-
- ldr r12, [r3], #4 ; limit
- ldr r3, [src], pstep ; p1
-
- ldr r9, [sp, #36] ; count for 8-in-parallel
- ldr r4, [src], pstep ; p0
-
- ldr r7, [r2], #4 ; flimit
- ldr r5, [src], pstep ; q0
+ ldr r12, [r3] ; limit
+ ldr r3, [src, -pstep, lsl #1] ; p1
+ ldr r4, [src, -pstep] ; p0
+ ldr r5, [src] ; q0
+ ldr r6, [src, pstep] ; q1
+ ldr r7, [r2] ; flimit
ldr r2, c0x80808080
-
- ldr r6, [src] ; q1
-
+ ldr r9, [sp, #40] ; count for 8-in-parallel
uadd8 r7, r7, r7 ; flimit * 2
- mov r9, r9, lsl #1 ; 4-in-parallel
+ mov r9, r9, lsl #1 ; double the count. we're doing 4 at a time
uadd8 r12, r7, r12 ; flimit * 2 + limit
+ mov lr, #0 ; need 0 in a couple places
|simple_hnext8|
- ; vp8_simple_filter_mask() function
+ ; vp8_simple_filter_mask()
uqsub8 r7, r3, r6 ; p1 - q1
uqsub8 r8, r6, r3 ; q1 - p1
uqsub8 r10, r4, r5 ; p0 - q0
uqsub8 r11, r5, r4 ; q0 - p0
orr r8, r8, r7 ; abs(p1 - q1)
- ldr lr, c0x7F7F7F7F ; 01111111 mask
orr r10, r10, r11 ; abs(p0 - q0)
- and r8, lr, r8, lsr #1 ; abs(p1 - q1) / 2
uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2
- mvn lr, #0 ; r10 == -1
+ uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1
uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
- ; STALL waiting on r10 :(
- uqsub8 r10, r10, r12 ; compare to flimit
- mov r8, #0
-
- usub8 r10, r8, r10 ; use usub8 instead of ssub8
- ; STALL (maybe?) when are flags set? :/
- sel r10, lr, r8 ; filter mask: lr
-
+ mvn r8, #0
+ usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags
+ sel r10, r8, lr ; filter mask: F or 0
cmp r10, #0
- beq simple_hskip_filter ; skip filtering
+ beq simple_hskip_filter ; skip filtering if all masks are 0x00
- ;vp8_simple_filter() function
+ ;vp8_simple_filter()
eor r3, r3, r2 ; p1 offset to convert to a signed value
eor r6, r6, r2 ; q1 offset to convert to a signed value
eor r4, r4, r2 ; p0 offset to convert to a signed value
eor r5, r5, r2 ; q0 offset to convert to a signed value
- qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
- qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
-
- qadd8 r3, r3, r6
- ldr r8, c0x03030303 ; r8 = 3
-
- qadd8 r3, r3, r6
+ qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
+ qsub8 r6, r5, r4 ; q0 - p0
+ qadd8 r3, r3, r6 ; += q0 - p0
ldr r7, c0x04040404
-
- qadd8 r3, r3, r6
- and r3, r3, lr ; vp8_filter &= mask;
-
- ;save bottom 3 bits so that we round one side +4 and the other +3
- qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
- qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
-
- mov r7, #0
- shadd8 r8 , r8 , r7 ; Filter2 >>= 3
- shadd8 r3 , r3 , r7 ; Filter1 >>= 3
- shadd8 r8 , r8 , r7
- shadd8 r3 , r3 , r7
- shadd8 r8 , r8 , r7 ; r8: Filter2
- shadd8 r3 , r3 , r7 ; r7: filter1
-
- ;calculate output
- sub src, src, pstep, lsl #1
-
- qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2)
- qsub8 r5 ,r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1)
- eor r4, r4, r2 ; *op0 = u^0x80
- str r4, [src], pstep ; store op0 result
+ qadd8 r3, r3, r6 ; += q0 - p0
+ ldr r8, c0x03030303
+ qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
+ ;STALL
+ and r3, r3, r10 ; vp8_filter &= mask
+
+ qadd8 r7 , r3 , r7 ; Filter1 = vp8_filter + 4
+ qadd8 r8 , r3 , r8 ; Filter2 = vp8_filter + 3
+
+ shadd8 r7 , r7 , lr
+ shadd8 r8 , r8 , lr
+ shadd8 r7 , r7 , lr
+ shadd8 r8 , r8 , lr
+ shadd8 r7 , r7 , lr ; Filter1 >>= 3
+ shadd8 r8 , r8 , lr ; Filter2 >>= 3
+
+ qsub8 r5 ,r5, r7 ; u = q0 - Filter1
+ qadd8 r4, r4, r8 ; u = p0 + Filter2
eor r5, r5, r2 ; *oq0 = u^0x80
- str r5, [src], pstep ; store oq0 result
+ str r5, [src] ; store oq0 result
+ eor r4, r4, r2 ; *op0 = u^0x80
+ str r4, [src, -pstep] ; store op0 result
|simple_hskip_filter|
- add src, src, #4
- sub src, src, pstep
- sub src, src, pstep, lsl #1
-
subs r9, r9, #1
+ addne src, src, #4 ; next row
- ;pld [src]
- ;pld [src, pstep]
- ;pld [src, pstep, lsl #1]
-
- ldrne r3, [src], pstep ; p1
- ldrne r4, [src], pstep ; p0
- ldrne r5, [src], pstep ; q0
- ldrne r6, [src] ; q1
+ ldrne r3, [src, -pstep, lsl #1] ; p1
+ ldrne r4, [src, -pstep] ; p0
+ ldrne r5, [src] ; q0
+ ldrne r6, [src, pstep] ; q1
bne simple_hnext8
@@ -173,9 +148,9 @@ pstep RN r1
;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
stmdb sp!, {r4 - r11, lr}
- ldr r12, [r2], #4 ; r12: flimit
+ ldr r12, [r2] ; r12: flimit
ldr r2, c0x80808080
- ldr r7, [r3], #4 ; limit
+ ldr r7, [r3] ; limit
; load soure data to r7, r8, r9, r10
ldrh r3, [src, #-2]
@@ -212,16 +187,14 @@ pstep RN r1
uqsub8 r10, r5, r4 ; q0 - p0
orr r7, r7, r8 ; abs(p1 - q1)
orr r9, r9, r10 ; abs(p0 - q0)
- ldr lr, c0x7F7F7F7F ; 0111 1111 mask
- uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
- and r7, lr, r7, lsr #1 ; abs(p1 - q1) / 2
mov r8, #0
+ uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2
+ uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2
uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2
mvn r10, #0 ; r10 == -1
- uqsub8 r7, r7, r12 ; compare to flimit
- usub8 r7, r8, r7
- sel r7, r10, r8 ; filter mask: lr
+ usub8 r7, r12, r7 ; compare to flimit
+ sel lr, r10, r8 ; filter mask
cmp lr, #0
beq simple_vskip_filter ; skip filtering
@@ -232,35 +205,34 @@ pstep RN r1
eor r4, r4, r2 ; p0 offset to convert to a signed value
eor r5, r5, r2 ; q0 offset to convert to a signed value
- qsub8 r3, r3, r6 ; vp8_filter (r3) = vp8_signed_char_clamp(p1-q1)
- qsub8 r6, r5, r4 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( q0 - p0))
+ qsub8 r3, r3, r6 ; vp8_filter = p1 - q1
+ qsub8 r6, r5, r4 ; q0 - p0
- qadd8 r3, r3, r6
- ldr r8, c0x03030303 ; r8 = 3
+ qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
+ ldr r9, c0x03030303 ; r9 = 3
- qadd8 r3, r3, r6
+ qadd8 r3, r3, r6 ; vp8_filter += q0 - p0
ldr r7, c0x04040404
- qadd8 r3, r3, r6
+ qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0))
+ ;STALL
and r3, r3, lr ; vp8_filter &= mask
- ;save bottom 3 bits so that we round one side +4 and the other +3
- qadd8 r8 , r3 , r8 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3)
- qadd8 r3 , r3 , r7 ; Filter1 (r3) = vp8_signed_char_clamp(vp8_filter+4)
+ qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3
+ qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4
- mov r7, #0
- shadd8 r8 , r8 , r7 ; Filter2 >>= 3
- shadd8 r3 , r3 , r7 ; Filter1 >>= 3
- shadd8 r8 , r8 , r7
- shadd8 r3 , r3 , r7
- shadd8 r8 , r8 , r7 ; r8: filter2
- shadd8 r3 , r3 , r7 ; r7: filter1
+ shadd8 r9 , r9 , r8
+ shadd8 r3 , r3 , r8
+ shadd8 r9 , r9 , r8
+ shadd8 r3 , r3 , r8
+ shadd8 r9 , r9 , r8 ; Filter2 >>= 3
+ shadd8 r3 , r3 , r8 ; Filter1 >>= 3
;calculate output
sub src, src, pstep, lsl #2
- qadd8 r4, r4, r8 ; u = vp8_signed_char_clamp(p0 + Filter2)
- qsub8 r5, r5, r3 ; u = vp8_signed_char_clamp(q0 - Filter1)
+ qadd8 r4, r4, r9 ; u = p0 + Filter2
+ qsub8 r5, r5, r3 ; u = q0 - Filter1
eor r4, r4, r2 ; *op0 = u^0x80
eor r5, r5, r2 ; *oq0 = u^0x80
@@ -285,10 +257,6 @@ pstep RN r1
|simple_vskip_filter|
subs r11, r11, #1
- ;pld [src]
- ;pld [src, pstep]
- ;pld [src, pstep, lsl #1]
-
; load soure data to r7, r8, r9, r10
ldrneh r3, [src, #-2]
ldrneh r4, [src], pstep
@@ -308,14 +276,12 @@ pstep RN r1
bne simple_vnext8
- ldmia sp!, {r4 - r12, pc}
+ ldmia sp!, {r4 - r11, pc}
ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6|
; Constant Pool
c0x80808080 DCD 0x80808080
c0x03030303 DCD 0x03030303
c0x04040404 DCD 0x04040404
-c0x01010101 DCD 0x01010101
-c0x7F7F7F7F DCD 0x7F7F7F7F
END