diff options
Diffstat (limited to 'libc/arch-x86/string/sse2-memchr-atom.S')
-rw-r--r-- | libc/arch-x86/string/sse2-memchr-atom.S | 556 |
1 files changed, 556 insertions, 0 deletions
diff --git a/libc/arch-x86/string/sse2-memchr-atom.S b/libc/arch-x86/string/sse2-memchr-atom.S new file mode 100644 index 000000000..013af9b66 --- /dev/null +++ b/libc/arch-x86/string/sse2-memchr-atom.S @@ -0,0 +1,556 @@ +/* +Copyright (c) 2011, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef cfi_rel_offset +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +#endif + +#ifndef cfi_restore +# define cfi_restore(reg) .cfi_restore reg +#endif + +#ifndef cfi_adjust_cfa_offset +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) pushl REG; CFI_PUSH (REG) +#define POP(REG) popl REG; CFI_POP (REG) + +#define ENTRANCE PUSH (%edi); +#define PARMS 8 +#define RETURN POP (%edi); ret; CFI_PUSH (%edi); + +#define STR1 PARMS +#define STR2 STR1+4 +#define LEN STR2+4 + + .text +ENTRY (memchr) + ENTRANCE + mov STR1(%esp), %ecx + movd STR2(%esp), %xmm1 + mov LEN(%esp), %edx + test %edx, %edx + jz L(return_null) + + punpcklbw %xmm1, %xmm1 + mov %ecx, %edi + punpcklbw %xmm1, %xmm1 + + and $63, %ecx + pshufd $0, %xmm1, %xmm1 + cmp $48, %ecx + ja L(crosscache) + + movdqu (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case2_prolog) + + sub $16, %edx + jbe L(return_null) + lea 16(%edi), %edi + and $15, %ecx + and $-16, %edi + add %ecx, %edx + sub $64, %edx + jbe L(exit_loop) + jmp L(loop_prolog) + + .p2align 4 +L(crosscache): + and $15, %ecx + and $-16, %edi + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + sar %cl, %eax + test %eax, %eax + + jnz L(match_case2_prolog1) + lea -16(%edx), %edx + add %ecx, %edx + jle L(return_null) + lea 16(%edi), %edi + sub $64, %edx + jbe L(exit_loop) + + .p2align 4 +L(loop_prolog): + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + + movdqa 48(%edi), %xmm4 + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + + lea 64(%edi), %edi + sub $64, %edx + jbe L(exit_loop) + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case1) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case1) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case1) + + movdqa 48(%edi), %xmm4 + pcmpeqb %xmm1, %xmm4 + lea 16(%ecx), %ecx + pmovmskb %xmm4, %eax + test %eax, %eax + jnz L(match_case1) + + lea 64(%edi), %edi + mov %edi, %ecx + and $-64, %edi + and $63, %ecx + add %ecx, %edx + + .p2align 4 +L(align64_loop): + sub $64, %edx + jbe L(exit_loop) + movdqa (%edi), %xmm0 + movdqa 16(%edi), %xmm2 + movdqa 32(%edi), %xmm3 + movdqa 48(%edi), %xmm4 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm1, %xmm4 + + pmaxub %xmm0, %xmm3 + pmaxub %xmm2, %xmm4 + pmaxub %xmm3, %xmm4 + add $64, %edi + pmovmskb %xmm4, %eax + + test %eax, %eax + jz L(align64_loop) + + sub $64, %edi + + pmovmskb %xmm0, %eax + xor %ecx, %ecx + test %eax, %eax + jnz L(match_case1) + + pmovmskb %xmm2, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + pmovmskb %xmm3, %eax + lea 16(%ecx), %ecx + test %eax, %eax + jnz L(match_case1) + + pcmpeqb 48(%edi), %xmm1 + pmovmskb %xmm1, %eax + lea 16(%ecx), %ecx + + .p2align 4 +L(match_case1): + add %ecx, %edi + test %al, %al + jz L(match_case1_high) + mov %al, %cl + and $15, %cl + jz L(match_case1_8) + test $0x01, %al + jnz L(exit_case1_1) + test $0x02, %al + jnz L(exit_case1_2) + test $0x04, %al + jnz L(exit_case1_3) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_case1_8): + test $0x10, %al + jnz L(exit_case1_5) + test $0x20, %al + jnz L(exit_case1_6) + test $0x40, %al + jnz L(exit_case1_7) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case1_high): + mov %ah, %ch + and $15, %ch + jz L(match_case1_high_8) + test $0x01, %ah + jnz L(exit_case1_9) + test $0x02, %ah + jnz L(exit_case1_10) + test $0x04, %ah + jnz L(exit_case1_11) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case1_high_8): + test $0x10, %ah + jnz L(exit_case1_13) + test $0x20, %ah + jnz L(exit_case1_14) + test $0x40, %ah + jnz L(exit_case1_15) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(exit_loop): + add $64, %edx + + movdqa (%edi), %xmm0 + pcmpeqb %xmm1, %xmm0 + xor %ecx, %ecx + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(match_case2) + cmp $16, %edx + jbe L(return_null) + + movdqa 16(%edi), %xmm2 + pcmpeqb %xmm1, %xmm2 + lea 16(%ecx), %ecx + pmovmskb %xmm2, %eax + test %eax, %eax + jnz L(match_case2) + cmp $32, %edx + jbe L(return_null) + + movdqa 32(%edi), %xmm3 + pcmpeqb %xmm1, %xmm3 + lea 16(%ecx), %ecx + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(match_case2) + cmp $48, %edx + jbe L(return_null) + + pcmpeqb 48(%edi), %xmm1 + lea 16(%ecx), %ecx + pmovmskb %xmm1, %eax + test %eax, %eax + jnz L(match_case2) + + xor %eax, %eax + RETURN + + .p2align 4 +L(exit_case1_1): + mov %edi, %eax + RETURN + + .p2align 4 +L(exit_case1_2): + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_3): + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_5): + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_6): + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_7): + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_9): + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_10): + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_11): + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_13): + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_14): + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(exit_case1_15): + lea 14(%edi), %eax + RETURN + + .p2align 4 +L(match_case2): + sub %ecx, %edx +L(match_case2_prolog1): + add %ecx, %edi +L(match_case2_prolog): + test %al, %al + jz L(match_case2_high) + mov %al, %cl + and $15, %cl + jz L(match_case2_8) + test $0x01, %al + jnz L(exit_case2_1) + test $0x02, %al + jnz L(exit_case2_2) + test $0x04, %al + jnz L(exit_case2_3) + sub $4, %edx + jb L(return_null) + lea 3(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_8): + test $0x10, %al + jnz L(exit_case2_5) + test $0x20, %al + jnz L(exit_case2_6) + test $0x40, %al + jnz L(exit_case2_7) + sub $8, %edx + jb L(return_null) + lea 7(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high): + mov %ah, %ch + and $15, %ch + jz L(match_case2_high_8) + test $0x01, %ah + jnz L(exit_case2_9) + test $0x02, %ah + jnz L(exit_case2_10) + test $0x04, %ah + jnz L(exit_case2_11) + sub $12, %edx + jb L(return_null) + lea 11(%edi), %eax + RETURN + + .p2align 4 +L(match_case2_high_8): + test $0x10, %ah + jnz L(exit_case2_13) + test $0x20, %ah + jnz L(exit_case2_14) + test $0x40, %ah + jnz L(exit_case2_15) + sub $16, %edx + jb L(return_null) + lea 15(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_1): + mov %edi, %eax + RETURN + + .p2align 4 +L(exit_case2_2): + sub $2, %edx + jb L(return_null) + lea 1(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_3): + sub $3, %edx + jb L(return_null) + lea 2(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_5): + sub $5, %edx + jb L(return_null) + lea 4(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_6): + sub $6, %edx + jb L(return_null) + lea 5(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_7): + sub $7, %edx + jb L(return_null) + lea 6(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_9): + sub $9, %edx + jb L(return_null) + lea 8(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_10): + sub $10, %edx + jb L(return_null) + lea 9(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_11): + sub $11, %edx + jb L(return_null) + lea 10(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_13): + sub $13, %edx + jb L(return_null) + lea 12(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_14): + sub $14, %edx + jb L(return_null) + lea 13(%edi), %eax + RETURN + + .p2align 4 +L(exit_case2_15): + sub $15, %edx + jb L(return_null) + lea 14(%edi), %eax + RETURN + .p2align 4 +L(return_null): + xor %eax, %eax + RETURN +END (memchr) |