path: root/linux-x86_64/crypto/bn
author    Adam Langley <agl@google.com>  2015-01-22 14:27:53 -0800
committer Adam Langley <agl@google.com>  2015-01-30 16:52:14 -0800
commit    d9e397b599b13d642138480a28c14db7a136bf05 (patch)
tree      34bab61dc4ce323b123ad4614dbc07e86ea2f9ef /linux-x86_64/crypto/bn
Initial commit of BoringSSL for Android.
Diffstat (limited to 'linux-x86_64/crypto/bn')
-rw-r--r--  linux-x86_64/crypto/bn/modexp512-x86_64.S  1776
-rw-r--r--  linux-x86_64/crypto/bn/rsaz-avx2.S           34
-rw-r--r--  linux-x86_64/crypto/bn/rsaz-x86_64.S       1126
-rw-r--r--  linux-x86_64/crypto/bn/x86_64-mont.S        726
-rw-r--r--  linux-x86_64/crypto/bn/x86_64-mont5.S      1822
5 files changed, 5484 insertions, 0 deletions
diff --git a/linux-x86_64/crypto/bn/modexp512-x86_64.S b/linux-x86_64/crypto/bn/modexp512-x86_64.S
new file mode 100644
index 0000000..e49a2cb
--- /dev/null
+++ b/linux-x86_64/crypto/bn/modexp512-x86_64.S
@@ -0,0 +1,1776 @@
+#if defined(__x86_64__)
+.text
+
+.type MULADD_128x512,@function
+.align 16
+MULADD_128x512:
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rcx)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ .byte 0xf3,0xc3
+.size MULADD_128x512,.-MULADD_128x512
+.type mont_reduce,@function
+.align 16
+mont_reduce:
+ leaq 192(%rsp),%rdi
+ movq 32(%rsp),%rsi
+ addq $576,%rsi
+ leaq 520(%rsp),%rcx
+
+ movq 96(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ movq (%rcx),%r8
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,0(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ movq 8(%rcx),%r9
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ movq 16(%rcx),%r10
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ movq 24(%rcx),%r11
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ movq 32(%rcx),%r12
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ movq 40(%rcx),%r13
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ movq 48(%rcx),%r14
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ movq 56(%rcx),%r15
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq 104(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ movq %r9,8(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 112(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,16(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 120(%rcx),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,24(%rdi)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ xorq %rax,%rax
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+ adcq 80(%rcx),%r10
+ adcq 88(%rcx),%r11
+ adcq $0,%rax
+
+
+
+
+ movq %r8,64(%rdi)
+ movq %r9,72(%rdi)
+ movq %r10,%rbp
+ movq %r11,88(%rdi)
+
+ movq %rax,384(%rsp)
+
+ movq 0(%rdi),%r8
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+
+
+
+
+
+
+
+
+ addq $80,%rdi
+
+ addq $64,%rsi
+ leaq 296(%rsp),%rcx
+
+ call MULADD_128x512
+
+ movq 384(%rsp),%rax
+
+
+ addq -16(%rdi),%r8
+ adcq -8(%rdi),%r9
+ movq %r8,64(%rcx)
+ movq %r9,72(%rcx)
+
+ adcq %rax,%rax
+ movq %rax,384(%rsp)
+
+ leaq 192(%rsp),%rdi
+ addq $64,%rsi
+
+
+
+
+
+ movq (%rsi),%r8
+ movq 8(%rsi),%rbx
+
+ movq (%rcx),%rax
+ mulq %r8
+ movq %rax,%rbp
+ movq %rdx,%r9
+
+ movq 8(%rcx),%rax
+ mulq %r8
+ addq %rax,%r9
+
+ movq (%rcx),%rax
+ mulq %rbx
+ addq %rax,%r9
+
+ movq %r9,8(%rdi)
+
+
+ subq $192,%rsi
+
+ movq (%rcx),%r8
+ movq 8(%rcx),%r9
+
+ call MULADD_128x512
+
+
+
+
+ movq 0(%rsi),%rax
+ movq 8(%rsi),%rbx
+ movq 16(%rsi),%rdi
+ movq 24(%rsi),%rdx
+
+
+ movq 384(%rsp),%rbp
+
+ addq 64(%rcx),%r8
+ adcq 72(%rcx),%r9
+
+
+ adcq %rbp,%rbp
+
+
+
+ shlq $3,%rbp
+ movq 32(%rsp),%rcx
+ addq %rcx,%rbp
+
+
+ xorq %rsi,%rsi
+
+ addq 0(%rbp),%r10
+ adcq 64(%rbp),%r11
+ adcq 128(%rbp),%r12
+ adcq 192(%rbp),%r13
+ adcq 256(%rbp),%r14
+ adcq 320(%rbp),%r15
+ adcq 384(%rbp),%r8
+ adcq 448(%rbp),%r9
+
+
+
+ sbbq $0,%rsi
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+ movq $1,%rbp
+ subq %rax,%r10
+ sbbq %rbx,%r11
+ sbbq %rdi,%r12
+ sbbq %rdx,%r13
+
+
+
+
+ sbbq $0,%rbp
+
+
+
+ addq $512,%rcx
+ movq 32(%rcx),%rax
+ movq 40(%rcx),%rbx
+ movq 48(%rcx),%rdi
+ movq 56(%rcx),%rdx
+
+
+
+ andq %rsi,%rax
+ andq %rsi,%rbx
+ andq %rsi,%rdi
+ andq %rsi,%rdx
+
+
+
+ subq $1,%rbp
+
+ sbbq %rax,%r14
+ sbbq %rbx,%r15
+ sbbq %rdi,%r8
+ sbbq %rdx,%r9
+
+
+
+ movq 144(%rsp),%rsi
+ movq %r10,0(%rsi)
+ movq %r11,8(%rsi)
+ movq %r12,16(%rsi)
+ movq %r13,24(%rsi)
+ movq %r14,32(%rsi)
+ movq %r15,40(%rsi)
+ movq %r8,48(%rsi)
+ movq %r9,56(%rsi)
+
+ .byte 0xf3,0xc3
+.size mont_reduce,.-mont_reduce
+.type mont_mul_a3b,@function
+.align 16
+mont_mul_a3b:
+
+
+
+
+ movq 0(%rdi),%rbp
+
+ movq %r10,%rax
+ mulq %rbp
+ movq %rax,520(%rsp)
+ movq %rdx,%r10
+ movq %r11,%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r12,%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r13,%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r14,%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r15,%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r8,%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %rdx,%r9
+ movq 8(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %r10,528(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%r10
+ movq 16(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,536(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq 24(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,544(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq 32(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,552(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq 40(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %r14,560(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq 48(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,568(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ addq %rbx,%r8
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq 56(%rdi),%rbp
+ movq 0(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r8
+ adcq $0,%rdx
+ movq %r8,576(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r9
+ adcq $0,%rdx
+ addq %rbx,%r9
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 16(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r10
+ adcq $0,%rdx
+ addq %rbx,%r10
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 24(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %rbx,%r11
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 32(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %rbx,%r12
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 40(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %rbx,%r13
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 48(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %rbx,%r14
+ adcq $0,%rdx
+ movq %rdx,%rbx
+
+ movq 56(%rsi),%rax
+ mulq %rbp
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %rbx,%r15
+ adcq $0,%rdx
+ movq %rdx,%r8
+ movq %r9,584(%rsp)
+ movq %r10,592(%rsp)
+ movq %r11,600(%rsp)
+ movq %r12,608(%rsp)
+ movq %r13,616(%rsp)
+ movq %r14,624(%rsp)
+ movq %r15,632(%rsp)
+ movq %r8,640(%rsp)
+
+
+
+
+
+ jmp mont_reduce
+
+
+.size mont_mul_a3b,.-mont_mul_a3b
+.type sqr_reduce,@function
+.align 16
+sqr_reduce:
+ movq 16(%rsp),%rcx
+
+
+
+ movq %r10,%rbx
+
+ movq %r11,%rax
+ mulq %rbx
+ movq %rax,528(%rsp)
+ movq %rdx,%r10
+ movq %r12,%rax
+ mulq %rbx
+ addq %rax,%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+ movq %r13,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %rdx,%r12
+ movq %r14,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %rdx,%r13
+ movq %r15,%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %rdx,%r14
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ movq %rdx,%r15
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %rdx,%rsi
+
+ movq %r10,536(%rsp)
+
+
+
+
+
+ movq 8(%rcx),%rbx
+
+ movq 16(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,544(%rsp)
+
+ movq %rdx,%r10
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,552(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 16(%rcx),%rbx
+
+ movq 24(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r13
+ adcq $0,%rdx
+ movq %r13,560(%rsp)
+
+ movq %rdx,%r10
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r14
+ adcq $0,%rdx
+ addq %r10,%r14
+ adcq $0,%rdx
+ movq %r14,568(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r12
+
+
+
+
+
+ movq 24(%rcx),%rbx
+
+ movq 32(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,576(%rsp)
+
+ movq %rdx,%r10
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%rsi
+ adcq $0,%rdx
+ addq %r10,%rsi
+ adcq $0,%rdx
+ movq %rsi,584(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+
+ movq %rdx,%r15
+
+
+
+
+ movq 32(%rcx),%rbx
+
+ movq 40(%rcx),%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ movq %r11,592(%rsp)
+
+ movq %rdx,%r10
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ addq %r10,%r12
+ adcq $0,%rdx
+ movq %r12,600(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r10,%r15
+ adcq $0,%rdx
+
+ movq %rdx,%r11
+
+
+
+
+ movq 40(%rcx),%rbx
+
+ movq %r8,%rax
+ mulq %rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ movq %r15,608(%rsp)
+
+ movq %rdx,%r10
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r11
+ adcq $0,%rdx
+ addq %r10,%r11
+ adcq $0,%rdx
+ movq %r11,616(%rsp)
+
+ movq %rdx,%r12
+
+
+
+
+ movq %r8,%rbx
+
+ movq %r9,%rax
+ mulq %rbx
+ addq %rax,%r12
+ adcq $0,%rdx
+ movq %r12,624(%rsp)
+
+ movq %rdx,632(%rsp)
+
+
+ movq 528(%rsp),%r10
+ movq 536(%rsp),%r11
+ movq 544(%rsp),%r12
+ movq 552(%rsp),%r13
+ movq 560(%rsp),%r14
+ movq 568(%rsp),%r15
+
+ movq 24(%rcx),%rax
+ mulq %rax
+ movq %rax,%rdi
+ movq %rdx,%r8
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq $0,%r8
+
+ movq 0(%rcx),%rax
+ mulq %rax
+ movq %rax,520(%rsp)
+ movq %rdx,%rbx
+
+ movq 8(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+ movq %r10,528(%rsp)
+ movq %r11,536(%rsp)
+
+ movq 16(%rcx),%rax
+ mulq %rax
+
+ addq %rbx,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbx
+
+ movq %r12,544(%rsp)
+ movq %r13,552(%rsp)
+
+ xorq %rbp,%rbp
+ addq %rbx,%r14
+ adcq %rdi,%r15
+ adcq $0,%rbp
+
+ movq %r14,560(%rsp)
+ movq %r15,568(%rsp)
+
+
+
+
+ movq 576(%rsp),%r10
+ movq 584(%rsp),%r11
+ movq 592(%rsp),%r12
+ movq 600(%rsp),%r13
+ movq 608(%rsp),%r14
+ movq 616(%rsp),%r15
+ movq 624(%rsp),%rdi
+ movq 632(%rsp),%rsi
+
+ movq %r9,%rax
+ mulq %rax
+ movq %rax,%r9
+ movq %rdx,%rbx
+
+ addq %r10,%r10
+ adcq %r11,%r11
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+ adcq %r15,%r15
+ adcq %rdi,%rdi
+ adcq %rsi,%rsi
+ adcq $0,%rbx
+
+ addq %rbp,%r10
+
+ movq 32(%rcx),%rax
+ mulq %rax
+
+ addq %r8,%r10
+ adcq %rax,%r11
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r10,576(%rsp)
+ movq %r11,584(%rsp)
+
+ movq 40(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r12
+ adcq %rax,%r13
+ adcq $0,%rdx
+
+ movq %rdx,%rbp
+
+ movq %r12,592(%rsp)
+ movq %r13,600(%rsp)
+
+ movq 48(%rcx),%rax
+ mulq %rax
+
+ addq %rbp,%r14
+ adcq %rax,%r15
+ adcq $0,%rdx
+
+ movq %r14,608(%rsp)
+ movq %r15,616(%rsp)
+
+ addq %rdx,%rdi
+ adcq %r9,%rsi
+ adcq $0,%rbx
+
+ movq %rdi,624(%rsp)
+ movq %rsi,632(%rsp)
+ movq %rbx,640(%rsp)
+
+ jmp mont_reduce
+
+
+.size sqr_reduce,.-sqr_reduce
+.globl mod_exp_512
+.hidden mod_exp_512
+.type mod_exp_512,@function
+mod_exp_512:
+ pushq %rbp
+ pushq %rbx
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+
+ movq %rsp,%r8
+ subq $2688,%rsp
+ andq $-64,%rsp
+
+
+ movq %r8,0(%rsp)
+ movq %rdi,8(%rsp)
+ movq %rsi,16(%rsp)
+ movq %rcx,24(%rsp)
+.Lbody:
+
+
+
+ pxor %xmm4,%xmm4
+ movdqu 0(%rsi),%xmm0
+ movdqu 16(%rsi),%xmm1
+ movdqu 32(%rsi),%xmm2
+ movdqu 48(%rsi),%xmm3
+ movdqa %xmm4,512(%rsp)
+ movdqa %xmm4,528(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,544(%rsp)
+ movdqa %xmm1,560(%rsp)
+ movdqa %xmm2,576(%rsp)
+ movdqa %xmm3,592(%rsp)
+
+
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+
+ leaq 384(%rsp),%rbx
+ movq %rbx,136(%rsp)
+ call mont_reduce
+
+
+ leaq 448(%rsp),%rcx
+ xorq %rax,%rax
+ movq %rax,0(%rcx)
+ movq %rax,8(%rcx)
+ movq %rax,24(%rcx)
+ movq %rax,32(%rcx)
+ movq %rax,40(%rcx)
+ movq %rax,48(%rcx)
+ movq %rax,56(%rcx)
+ movq %rax,128(%rsp)
+ movq $1,16(%rcx)
+
+ leaq 640(%rsp),%rbp
+ movq %rcx,%rsi
+ movq %rbp,%rdi
+ movq $8,%rax
+loop_0:
+ movq (%rcx),%rbx
+ movw %bx,(%rdi)
+ shrq $16,%rbx
+ movw %bx,64(%rdi)
+ shrq $16,%rbx
+ movw %bx,128(%rdi)
+ shrq $16,%rbx
+ movw %bx,192(%rdi)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rdi),%rdi
+ decq %rax
+ jnz loop_0
+ movq $31,%rax
+ movq %rax,32(%rsp)
+ movq %rbp,40(%rsp)
+
+ movq %rsi,136(%rsp)
+ movq 0(%rsi),%r10
+ movq 8(%rsi),%r11
+ movq 16(%rsi),%r12
+ movq 24(%rsi),%r13
+ movq 32(%rsi),%r14
+ movq 40(%rsi),%r15
+ movq 48(%rsi),%r8
+ movq 56(%rsi),%r9
+init_loop:
+ leaq 384(%rsp),%rdi
+ call mont_mul_a3b
+ leaq 448(%rsp),%rsi
+ movq 40(%rsp),%rbp
+ addq $2,%rbp
+ movq %rbp,40(%rsp)
+ movq %rsi,%rcx
+ movq $8,%rax
+loop_1:
+ movq (%rcx),%rbx
+ movw %bx,(%rbp)
+ shrq $16,%rbx
+ movw %bx,64(%rbp)
+ shrq $16,%rbx
+ movw %bx,128(%rbp)
+ shrq $16,%rbx
+ movw %bx,192(%rbp)
+ leaq 8(%rcx),%rcx
+ leaq 256(%rbp),%rbp
+ decq %rax
+ jnz loop_1
+ movq 32(%rsp),%rax
+ subq $1,%rax
+ movq %rax,32(%rsp)
+ jne init_loop
+
+
+
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm1,80(%rsp)
+ movdqa %xmm2,96(%rsp)
+ movdqa %xmm3,112(%rsp)
+
+
+
+
+
+ movl 126(%rsp),%eax
+ movq %rax,%rdx
+ shrq $11,%rax
+ andl $2047,%edx
+ movl %edx,126(%rsp)
+ leaq 640(%rsp,%rax,2),%rsi
+ movq 8(%rsp),%rdx
+ movq $4,%rbp
+loop_2:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_2
+ movq $505,48(%rsp)
+
+ movq 8(%rsp),%rcx
+ movq %rcx,136(%rsp)
+ movq 0(%rcx),%r10
+ movq 8(%rcx),%r11
+ movq 16(%rcx),%r12
+ movq 24(%rcx),%r13
+ movq 32(%rcx),%r14
+ movq 40(%rcx),%r15
+ movq 48(%rcx),%r8
+ movq 56(%rcx),%r9
+ jmp sqr_2
+
+main_loop_a3b:
+ call sqr_reduce
+ call sqr_reduce
+ call sqr_reduce
+sqr_2:
+ call sqr_reduce
+ call sqr_reduce
+
+
+
+ movq 48(%rsp),%rcx
+ movq %rcx,%rax
+ shrq $4,%rax
+ movl 64(%rsp,%rax,2),%edx
+ andq $15,%rcx
+ shrq %cl,%rdx
+ andq $31,%rdx
+
+ leaq 640(%rsp,%rdx,2),%rsi
+ leaq 448(%rsp),%rdx
+ movq %rdx,%rdi
+ movq $4,%rbp
+loop_3:
+ movzwq 192(%rsi),%rbx
+ movzwq 448(%rsi),%rax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 128(%rsi),%bx
+ movw 384(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 64(%rsi),%bx
+ movw 320(%rsi),%ax
+ shlq $16,%rbx
+ shlq $16,%rax
+ movw 0(%rsi),%bx
+ movw 256(%rsi),%ax
+ movq %rbx,0(%rdx)
+ movq %rax,8(%rdx)
+ leaq 512(%rsi),%rsi
+ leaq 16(%rdx),%rdx
+ subq $1,%rbp
+ jnz loop_3
+ movq 8(%rsp),%rsi
+ call mont_mul_a3b
+
+
+
+ movq 48(%rsp),%rcx
+ subq $5,%rcx
+ movq %rcx,48(%rsp)
+ jge main_loop_a3b
+
+
+
+end_main_loop_a3b:
+
+
+ movq 8(%rsp),%rdx
+ pxor %xmm4,%xmm4
+ movdqu 0(%rdx),%xmm0
+ movdqu 16(%rdx),%xmm1
+ movdqu 32(%rdx),%xmm2
+ movdqu 48(%rdx),%xmm3
+ movdqa %xmm4,576(%rsp)
+ movdqa %xmm4,592(%rsp)
+ movdqa %xmm4,608(%rsp)
+ movdqa %xmm4,624(%rsp)
+ movdqa %xmm0,512(%rsp)
+ movdqa %xmm1,528(%rsp)
+ movdqa %xmm2,544(%rsp)
+ movdqa %xmm3,560(%rsp)
+ call mont_reduce
+
+
+
+ movq 8(%rsp),%rax
+ movq 0(%rax),%r8
+ movq 8(%rax),%r9
+ movq 16(%rax),%r10
+ movq 24(%rax),%r11
+ movq 32(%rax),%r12
+ movq 40(%rax),%r13
+ movq 48(%rax),%r14
+ movq 56(%rax),%r15
+
+
+ movq 24(%rsp),%rbx
+ addq $512,%rbx
+
+ subq 0(%rbx),%r8
+ sbbq 8(%rbx),%r9
+ sbbq 16(%rbx),%r10
+ sbbq 24(%rbx),%r11
+ sbbq 32(%rbx),%r12
+ sbbq 40(%rbx),%r13
+ sbbq 48(%rbx),%r14
+ sbbq 56(%rbx),%r15
+
+
+ movq 0(%rax),%rsi
+ movq 8(%rax),%rdi
+ movq 16(%rax),%rcx
+ movq 24(%rax),%rdx
+ cmovncq %r8,%rsi
+ cmovncq %r9,%rdi
+ cmovncq %r10,%rcx
+ cmovncq %r11,%rdx
+ movq %rsi,0(%rax)
+ movq %rdi,8(%rax)
+ movq %rcx,16(%rax)
+ movq %rdx,24(%rax)
+
+ movq 32(%rax),%rsi
+ movq 40(%rax),%rdi
+ movq 48(%rax),%rcx
+ movq 56(%rax),%rdx
+ cmovncq %r12,%rsi
+ cmovncq %r13,%rdi
+ cmovncq %r14,%rcx
+ cmovncq %r15,%rdx
+ movq %rsi,32(%rax)
+ movq %rdi,40(%rax)
+ movq %rcx,48(%rax)
+ movq %rdx,56(%rax)
+
+ movq 0(%rsp),%rsi
+ movq 0(%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbx
+ movq 40(%rsi),%rbp
+ leaq 48(%rsi),%rsp
+.Lepilogue:
+ .byte 0xf3,0xc3
+.size mod_exp_512, . - mod_exp_512
+#endif
diff --git a/linux-x86_64/crypto/bn/rsaz-avx2.S b/linux-x86_64/crypto/bn/rsaz-avx2.S
new file mode 100644
index 0000000..cd334d9
--- /dev/null
+++ b/linux-x86_64/crypto/bn/rsaz-avx2.S
@@ -0,0 +1,34 @@
+#if defined(__x86_64__)
+.text
+
+.globl rsaz_avx2_eligible
+.hidden rsaz_avx2_eligible
+.type rsaz_avx2_eligible,@function
+rsaz_avx2_eligible:
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.globl rsaz_1024_sqr_avx2
+.hidden rsaz_1024_sqr_avx2
+.globl rsaz_1024_mul_avx2
+.hidden rsaz_1024_mul_avx2
+.globl rsaz_1024_norm2red_avx2
+.hidden rsaz_1024_norm2red_avx2
+.globl rsaz_1024_red2norm_avx2
+.hidden rsaz_1024_red2norm_avx2
+.globl rsaz_1024_scatter5_avx2
+.hidden rsaz_1024_scatter5_avx2
+.globl rsaz_1024_gather5_avx2
+.hidden rsaz_1024_gather5_avx2
+.type rsaz_1024_sqr_avx2,@function
+rsaz_1024_sqr_avx2:
+rsaz_1024_mul_avx2:
+rsaz_1024_norm2red_avx2:
+rsaz_1024_red2norm_avx2:
+rsaz_1024_scatter5_avx2:
+rsaz_1024_gather5_avx2:
+.byte 0x0f,0x0b
+ .byte 0xf3,0xc3
+.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+#endif
diff --git a/linux-x86_64/crypto/bn/rsaz-x86_64.S b/linux-x86_64/crypto/bn/rsaz-x86_64.S
new file mode 100644
index 0000000..fe0eed5
--- /dev/null
+++ b/linux-x86_64/crypto/bn/rsaz-x86_64.S
@@ -0,0 +1,1126 @@
+#if defined(__x86_64__)
+.text
+
+
+
+.globl rsaz_512_sqr
+.hidden rsaz_512_sqr
+.type rsaz_512_sqr,@function
+.align 32
+rsaz_512_sqr:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ subq $128+24,%rsp
+.Lsqr_body:
+ movq %rdx,%rbp
+ movq (%rsi),%rdx
+ movq 8(%rsi),%rax
+ movq %rcx,128(%rsp)
+ jmp .Loop_sqr
+
+.align 32
+.Loop_sqr:
+ movl %r8d,128+8(%rsp)
+
+ movq %rdx,%rbx
+ mulq %rdx
+ movq %rax,%r8
+ movq 16(%rsi),%rax
+ movq %rdx,%r9
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 24(%rsi),%rax
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 32(%rsi),%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 40(%rsi),%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 48(%rsi),%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 56(%rsi),%rax
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ addq %rax,%r14
+ movq %rbx,%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ addq %r8,%r8
+ movq %r9,%rcx
+ adcq %r9,%r9
+
+ mulq %rax
+ movq %rax,(%rsp)
+ addq %rdx,%r8
+ adcq $0,%r9
+
+ movq %r8,8(%rsp)
+ shrq $63,%rcx
+
+
+ movq 8(%rsi),%r8
+ movq 16(%rsi),%rax
+ mulq %r8
+ addq %rax,%r10
+ movq 24(%rsi),%rax
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r11
+ movq 32(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r11
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r12
+ movq 40(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r12
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r13
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r13
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r14
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r14
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r8
+ addq %rax,%r15
+ movq %r8,%rax
+ adcq $0,%rdx
+ addq %rbx,%r15
+ movq %rdx,%r8
+ movq %r10,%rdx
+ adcq $0,%r8
+
+ addq %rdx,%rdx
+ leaq (%rcx,%r10,2),%r10
+ movq %r11,%rbx
+ adcq %r11,%r11
+
+ mulq %rax
+ addq %rax,%r9
+ adcq %rdx,%r10
+ adcq $0,%r11
+
+ movq %r9,16(%rsp)
+ movq %r10,24(%rsp)
+ shrq $63,%rbx
+
+
+ movq 16(%rsi),%r9
+ movq 24(%rsi),%rax
+ mulq %r9
+ addq %rax,%r12
+ movq 32(%rsi),%rax
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r9
+ addq %rax,%r13
+ movq 40(%rsi),%rax
+ adcq $0,%rdx
+ addq %rcx,%r13
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r9
+ addq %rax,%r14
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %rcx,%r14
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r9
+ movq %r12,%r10
+ leaq (%rbx,%r12,2),%r12
+ addq %rax,%r15
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %rcx,%r15
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r9
+ shrq $63,%r10
+ addq %rax,%r8
+ movq %r9,%rax
+ adcq $0,%rdx
+ addq %rcx,%r8
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ movq %r13,%rcx
+ leaq (%r10,%r13,2),%r13
+
+ mulq %rax
+ addq %rax,%r11
+ adcq %rdx,%r12
+ adcq $0,%r13
+
+ movq %r11,32(%rsp)
+ movq %r12,40(%rsp)
+ shrq $63,%rcx
+
+
+ movq 24(%rsi),%r10
+ movq 32(%rsi),%rax
+ mulq %r10
+ addq %rax,%r14
+ movq 40(%rsi),%rax
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r10
+ addq %rax,%r15
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r15
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r10
+ movq %r14,%r12
+ leaq (%rcx,%r14,2),%r14
+ addq %rax,%r8
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %rbx,%r8
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r10
+ shrq $63,%r12
+ addq %rax,%r9
+ movq %r10,%rax
+ adcq $0,%rdx
+ addq %rbx,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ movq %r15,%rbx
+ leaq (%r12,%r15,2),%r15
+
+ mulq %rax
+ addq %rax,%r13
+ adcq %rdx,%r14
+ adcq $0,%r15
+
+ movq %r13,48(%rsp)
+ movq %r14,56(%rsp)
+ shrq $63,%rbx
+
+
+ movq 32(%rsi),%r11
+ movq 40(%rsi),%rax
+ mulq %r11
+ addq %rax,%r8
+ movq 48(%rsi),%rax
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r11
+ addq %rax,%r9
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ movq %r8,%r12
+ leaq (%rbx,%r8,2),%r8
+ addq %rcx,%r9
+ movq %rdx,%rcx
+ adcq $0,%rcx
+
+ mulq %r11
+ shrq $63,%r12
+ addq %rax,%r10
+ movq %r11,%rax
+ adcq $0,%rdx
+ addq %rcx,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ movq %r9,%rcx
+ leaq (%r12,%r9,2),%r9
+
+ mulq %rax
+ addq %rax,%r15
+ adcq %rdx,%r8
+ adcq $0,%r9
+
+ movq %r15,64(%rsp)
+ movq %r8,72(%rsp)
+ shrq $63,%rcx
+
+
+ movq 40(%rsi),%r12
+ movq 48(%rsi),%rax
+ mulq %r12
+ addq %rax,%r10
+ movq 56(%rsi),%rax
+ movq %rdx,%rbx
+ adcq $0,%rbx
+
+ mulq %r12
+ addq %rax,%r11
+ movq %r12,%rax
+ movq %r10,%r15
+ leaq (%rcx,%r10,2),%r10
+ adcq $0,%rdx
+ shrq $63,%r15
+ addq %rbx,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ movq %r11,%rbx
+ leaq (%r15,%r11,2),%r11
+
+ mulq %rax
+ addq %rax,%r9
+ adcq %rdx,%r10
+ adcq $0,%r11
+
+ movq %r9,80(%rsp)
+ movq %r10,88(%rsp)
+
+
+ movq 48(%rsi),%r13
+ movq 56(%rsi),%rax
+ mulq %r13
+ addq %rax,%r12
+ movq %r13,%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ xorq %r14,%r14
+ shlq $1,%rbx
+ adcq %r12,%r12
+ adcq %r13,%r13
+ adcq %r14,%r14
+
+ mulq %rax
+ addq %rax,%r11
+ adcq %rdx,%r12
+ adcq $0,%r13
+
+ movq %r11,96(%rsp)
+ movq %r12,104(%rsp)
+
+
+ movq 56(%rsi),%rax
+ mulq %rax
+ addq %rax,%r13
+ adcq $0,%rdx
+
+ addq %rdx,%r14
+
+ movq %r13,112(%rsp)
+ movq %r14,120(%rsp)
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reduce
+
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ movq %r8,%rdx
+ movq %r9,%rax
+ movl 128+8(%rsp),%r8d
+ movq %rdi,%rsi
+
+ decl %r8d
+ jnz .Loop_sqr
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lsqr_epilogue:
+ .byte 0xf3,0xc3
+.size rsaz_512_sqr,.-rsaz_512_sqr
+.globl rsaz_512_mul
+.hidden rsaz_512_mul
+.type rsaz_512_mul,@function
+.align 32
+rsaz_512_mul:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ subq $128+24,%rsp
+.Lmul_body:
+.byte 102,72,15,110,199
+.byte 102,72,15,110,201
+ movq %r8,128(%rsp)
+ movq (%rdx),%rbx
+ movq %rdx,%rbp
+ call __rsaz_512_mul
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reduce
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size rsaz_512_mul,.-rsaz_512_mul
+.globl rsaz_512_mul_gather4
+.hidden rsaz_512_mul_gather4
+.type rsaz_512_mul_gather4,@function
+.align 32
+rsaz_512_mul_gather4:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ subq $128+24,%rsp
+.Lmul_gather4_body:
+ movl 64(%rdx,%r9,4),%eax
+.byte 102,72,15,110,199
+ movl (%rdx,%r9,4),%ebx
+.byte 102,72,15,110,201
+ movq %r8,128(%rsp)
+
+ shlq $32,%rax
+ orq %rax,%rbx
+ movq (%rsi),%rax
+ movq 8(%rsi),%rcx
+ leaq 128(%rdx,%r9,4),%rbp
+ mulq %rbx
+ movq %rax,(%rsp)
+ movq %rcx,%rax
+ movq %rdx,%r8
+
+ mulq %rbx
+ movd (%rbp),%xmm4
+ addq %rax,%r8
+ movq 16(%rsi),%rax
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ movd 64(%rbp),%xmm5
+ addq %rax,%r9
+ movq 24(%rsi),%rax
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ pslldq $4,%xmm5
+ addq %rax,%r10
+ movq 32(%rsi),%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ por %xmm5,%xmm4
+ addq %rax,%r11
+ movq 40(%rsi),%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 48(%rsi),%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ leaq 128(%rbp),%rbp
+ addq %rax,%r13
+ movq 56(%rsi),%rax
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+.byte 102,72,15,126,227
+ addq %rax,%r14
+ movq (%rsi),%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ leaq 8(%rsp),%rdi
+ movl $7,%ecx
+ jmp .Loop_mul_gather
+
+.align 32
+.Loop_mul_gather:
+ mulq %rbx
+ addq %rax,%r8
+ movq 8(%rsi),%rax
+ movq %r8,(%rdi)
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ movd (%rbp),%xmm4
+ addq %rax,%r9
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ movd 64(%rbp),%xmm5
+ addq %rax,%r10
+ movq 24(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ pslldq $4,%xmm5
+ addq %rax,%r11
+ movq 32(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ por %xmm5,%xmm4
+ addq %rax,%r12
+ movq 40(%rsi),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+.byte 102,72,15,126,227
+ addq %rax,%r15
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ leaq 128(%rbp),%rbp
+ leaq 8(%rdi),%rdi
+
+ decl %ecx
+ jnz .Loop_mul_gather
+
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reduce
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lmul_gather4_epilogue:
+ .byte 0xf3,0xc3
+.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
+.globl rsaz_512_mul_scatter4
+.hidden rsaz_512_mul_scatter4
+.type rsaz_512_mul_scatter4,@function
+.align 32
+rsaz_512_mul_scatter4:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ subq $128+24,%rsp
+.Lmul_scatter4_body:
+ leaq (%r8,%r9,4),%r8
+.byte 102,72,15,110,199
+.byte 102,72,15,110,202
+.byte 102,73,15,110,208
+ movq %rcx,128(%rsp)
+
+ movq %rdi,%rbp
+ movq (%rdi),%rbx
+ call __rsaz_512_mul
+
+.byte 102,72,15,126,199
+.byte 102,72,15,126,205
+
+ movq (%rsp),%r8
+ movq 8(%rsp),%r9
+ movq 16(%rsp),%r10
+ movq 24(%rsp),%r11
+ movq 32(%rsp),%r12
+ movq 40(%rsp),%r13
+ movq 48(%rsp),%r14
+ movq 56(%rsp),%r15
+
+ call __rsaz_512_reduce
+ addq 64(%rsp),%r8
+ adcq 72(%rsp),%r9
+ adcq 80(%rsp),%r10
+ adcq 88(%rsp),%r11
+ adcq 96(%rsp),%r12
+ adcq 104(%rsp),%r13
+ adcq 112(%rsp),%r14
+ adcq 120(%rsp),%r15
+.byte 102,72,15,126,214
+ sbbq %rcx,%rcx
+
+ call __rsaz_512_subtract
+
+ movl %r8d,0(%rsi)
+ shrq $32,%r8
+ movl %r9d,128(%rsi)
+ shrq $32,%r9
+ movl %r10d,256(%rsi)
+ shrq $32,%r10
+ movl %r11d,384(%rsi)
+ shrq $32,%r11
+ movl %r12d,512(%rsi)
+ shrq $32,%r12
+ movl %r13d,640(%rsi)
+ shrq $32,%r13
+ movl %r14d,768(%rsi)
+ shrq $32,%r14
+ movl %r15d,896(%rsi)
+ shrq $32,%r15
+ movl %r8d,64(%rsi)
+ movl %r9d,192(%rsi)
+ movl %r10d,320(%rsi)
+ movl %r11d,448(%rsi)
+ movl %r12d,576(%rsi)
+ movl %r13d,704(%rsi)
+ movl %r14d,832(%rsi)
+ movl %r15d,960(%rsi)
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lmul_scatter4_epilogue:
+ .byte 0xf3,0xc3
+.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
+.globl rsaz_512_mul_by_one
+.hidden rsaz_512_mul_by_one
+.type rsaz_512_mul_by_one,@function
+.align 32
+rsaz_512_mul_by_one:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ subq $128+24,%rsp
+.Lmul_by_one_body:
+ movq %rdx,%rbp
+ movq %rcx,128(%rsp)
+
+ movq (%rsi),%r8
+ pxor %xmm0,%xmm0
+ movq 8(%rsi),%r9
+ movq 16(%rsi),%r10
+ movq 24(%rsi),%r11
+ movq 32(%rsi),%r12
+ movq 40(%rsi),%r13
+ movq 48(%rsi),%r14
+ movq 56(%rsi),%r15
+
+ movdqa %xmm0,(%rsp)
+ movdqa %xmm0,16(%rsp)
+ movdqa %xmm0,32(%rsp)
+ movdqa %xmm0,48(%rsp)
+ movdqa %xmm0,64(%rsp)
+ movdqa %xmm0,80(%rsp)
+ movdqa %xmm0,96(%rsp)
+ call __rsaz_512_reduce
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ leaq 128+24+48(%rsp),%rax
+ movq -48(%rax),%r15
+ movq -40(%rax),%r14
+ movq -32(%rax),%r13
+ movq -24(%rax),%r12
+ movq -16(%rax),%rbp
+ movq -8(%rax),%rbx
+ leaq (%rax),%rsp
+.Lmul_by_one_epilogue:
+ .byte 0xf3,0xc3
+.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
+.type __rsaz_512_reduce,@function
+.align 32
+__rsaz_512_reduce:
+ movq %r8,%rbx
+ imulq 128+8(%rsp),%rbx
+ movq 0(%rbp),%rax
+ movl $8,%ecx
+ jmp .Lreduction_loop
+
+.align 32
+.Lreduction_loop:
+ mulq %rbx
+ movq 8(%rbp),%rax
+ negq %r8
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq 128+8(%rsp),%rsi
+
+
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rbp),%rax
+ adcq $0,%rdx
+ imulq %r8,%rsi
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq %rsi,%rbx
+ addq %rax,%r15
+ movq 0(%rbp),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jne .Lreduction_loop
+
+ .byte 0xf3,0xc3
+.size __rsaz_512_reduce,.-__rsaz_512_reduce
+.type __rsaz_512_subtract,@function
+.align 32
+__rsaz_512_subtract:
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ movq 0(%rbp),%r8
+ movq 8(%rbp),%r9
+ negq %r8
+ notq %r9
+ andq %rcx,%r8
+ movq 16(%rbp),%r10
+ andq %rcx,%r9
+ notq %r10
+ movq 24(%rbp),%r11
+ andq %rcx,%r10
+ notq %r11
+ movq 32(%rbp),%r12
+ andq %rcx,%r11
+ notq %r12
+ movq 40(%rbp),%r13
+ andq %rcx,%r12
+ notq %r13
+ movq 48(%rbp),%r14
+ andq %rcx,%r13
+ notq %r14
+ movq 56(%rbp),%r15
+ andq %rcx,%r14
+ notq %r15
+ andq %rcx,%r15
+
+ addq (%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ .byte 0xf3,0xc3
+.size __rsaz_512_subtract,.-__rsaz_512_subtract
+.type __rsaz_512_mul,@function
+.align 32
+__rsaz_512_mul:
+ leaq 8(%rsp),%rdi
+
+ movq (%rsi),%rax
+ mulq %rbx
+ movq %rax,(%rdi)
+ movq 8(%rsi),%rax
+ movq %rdx,%r8
+
+ mulq %rbx
+ addq %rax,%r8
+ movq 16(%rsi),%rax
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 24(%rsi),%rax
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 32(%rsi),%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 40(%rsi),%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 48(%rsi),%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 56(%rsi),%rax
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ addq %rax,%r14
+ movq (%rsi),%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ leaq 8(%rbp),%rbp
+ leaq 8(%rdi),%rdi
+
+ movl $7,%ecx
+ jmp .Loop_mul
+
+.align 32
+.Loop_mul:
+ movq (%rbp),%rbx
+ mulq %rbx
+ addq %rax,%r8
+ movq 8(%rsi),%rax
+ movq %r8,(%rdi)
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 24(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 32(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 40(%rsi),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 48(%rsi),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 56(%rsi),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ leaq 8(%rbp),%rbp
+ adcq $0,%r14
+
+ mulq %rbx
+ addq %rax,%r15
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ leaq 8(%rdi),%rdi
+
+ decl %ecx
+ jnz .Loop_mul
+
+ movq %r8,(%rdi)
+ movq %r9,8(%rdi)
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+
+ .byte 0xf3,0xc3
+.size __rsaz_512_mul,.-__rsaz_512_mul
+.globl rsaz_512_scatter4
+.hidden rsaz_512_scatter4
+.type rsaz_512_scatter4,@function
+.align 16
+rsaz_512_scatter4:
+ leaq (%rdi,%rdx,4),%rdi
+ movl $8,%r9d
+ jmp .Loop_scatter
+.align 16
+.Loop_scatter:
+ movq (%rsi),%rax
+ leaq 8(%rsi),%rsi
+ movl %eax,(%rdi)
+ shrq $32,%rax
+ movl %eax,64(%rdi)
+ leaq 128(%rdi),%rdi
+ decl %r9d
+ jnz .Loop_scatter
+ .byte 0xf3,0xc3
+.size rsaz_512_scatter4,.-rsaz_512_scatter4
+
+.globl rsaz_512_gather4
+.hidden rsaz_512_gather4
+.type rsaz_512_gather4,@function
+.align 16
+rsaz_512_gather4:
+ leaq (%rsi,%rdx,4),%rsi
+ movl $8,%r9d
+ jmp .Loop_gather
+.align 16
+.Loop_gather:
+ movl (%rsi),%eax
+ movl 64(%rsi),%r8d
+ leaq 128(%rsi),%rsi
+ shlq $32,%r8
+ orq %r8,%rax
+ movq %rax,(%rdi)
+ leaq 8(%rdi),%rdi
+ decl %r9d
+ jnz .Loop_gather
+ .byte 0xf3,0xc3
+.size rsaz_512_gather4,.-rsaz_512_gather4
+#endif
diff --git a/linux-x86_64/crypto/bn/x86_64-mont.S b/linux-x86_64/crypto/bn/x86_64-mont.S
new file mode 100644
index 0000000..ebe7678
--- /dev/null
+++ b/linux-x86_64/crypto/bn/x86_64-mont.S
@@ -0,0 +1,726 @@
+#if defined(__x86_64__)
+.text
+
+
+
+.globl bn_mul_mont
+.hidden bn_mul_mont
+.type bn_mul_mont,@function
+.align 16
+bn_mul_mont:
+ testl $3,%r9d
+ jnz .Lmul_enter
+ cmpl $8,%r9d
+ jb .Lmul_enter
+ cmpq %rsi,%rdx
+ jne .Lmul4x_enter
+ testl $7,%r9d
+ jz .Lsqr8x_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 2(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ movq %r9,%r15
+.align 16
+.Lcopy:
+ movq (%rsp,%r14,8),%rsi
+ movq (%rdi,%r14,8),%rcx
+ xorq %rcx,%rsi
+ andq %rax,%rsi
+ xorq %rcx,%rsi
+ movq %r14,(%rsp,%r14,8)
+ movq %rsi,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont,.-bn_mul_mont
+.type bn_mul4x_mont,@function
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r9d
+ leaq 4(%r9),%r10
+ movq %rsp,%r11
+ negq %r10
+ leaq (%rsp,%r10,8),%rsp
+ andq $-1024,%rsp
+
+ movq %r11,8(%rsp,%r9,8)
+.Lmul4x_body:
+ movq %rdi,16(%rsp,%r9,8)
+ movq %rdx,%r12
+ movq (%r8),%r8
+ movq (%r12),%rbx
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .L1st4x
+.align 16
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ leaq 1(%r14),%r14
+.align 4
+.Louter4x:
+ movq (%r12,%r14,8),%rbx
+ xorq %r15,%r15
+ movq (%rsp),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%rsp),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 4(%r15),%r15
+ adcq $0,%rdx
+ movq %rdi,(%rsp)
+ movq %rdx,%r13
+ jmp .Linner4x
+.align 16
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq 8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 4(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq -16(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-32(%rsp,%r15,8)
+ movq %rdx,%r13
+ cmpq %r9,%r15
+ jb .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -16(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -16(%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%rsp,%r15,8)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -8(%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq -8(%rsp,%r15,8),%r11
+ adcq $0,%rdx
+ leaq 1(%r14),%r14
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%rsp,%r9,8),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%rsp,%r15,8)
+ movq %rdi,(%rsp,%r15,8)
+
+ cmpq %r9,%r14
+ jb .Louter4x
+ movq 16(%rsp,%r9,8),%rdi
+ movq 0(%rsp),%rax
+ movq 8(%rsp),%rdx
+ shrq $2,%r9
+ leaq (%rsp),%rsi
+ xorq %r14,%r14
+
+ subq 0(%rcx),%rax
+ movq 16(%rsi),%rbx
+ movq 24(%rsi),%rbp
+ sbbq 8(%rcx),%rdx
+ leaq -1(%r9),%r15
+ jmp .Lsub4x
+.align 16
+.Lsub4x:
+ movq %rax,0(%rdi,%r14,8)
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq 32(%rsi,%r14,8),%rax
+ movq 40(%rsi,%r14,8),%rdx
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+ movq %rbp,24(%rdi,%r14,8)
+ sbbq 32(%rcx,%r14,8),%rax
+ movq 48(%rsi,%r14,8),%rbx
+ movq 56(%rsi,%r14,8),%rbp
+ sbbq 40(%rcx,%r14,8),%rdx
+ leaq 4(%r14),%r14
+ decq %r15
+ jnz .Lsub4x
+
+ movq %rax,0(%rdi,%r14,8)
+ movq 32(%rsi,%r14,8),%rax
+ sbbq 16(%rcx,%r14,8),%rbx
+ movq %rdx,8(%rdi,%r14,8)
+ sbbq 24(%rcx,%r14,8),%rbp
+ movq %rbx,16(%rdi,%r14,8)
+
+ sbbq $0,%rax
+ movq %rax,%xmm0
+ punpcklqdq %xmm0,%xmm0
+ movq %rbp,24(%rdi,%r14,8)
+ xorq %r14,%r14
+
+ movq %r9,%r15
+ pxor %xmm5,%xmm5
+ jmp .Lcopy4x
+.align 16
+.Lcopy4x:
+ movdqu (%rsp,%r14,1),%xmm2
+ movdqu 16(%rsp,%r14,1),%xmm4
+ movdqu (%rdi,%r14,1),%xmm1
+ movdqu 16(%rdi,%r14,1),%xmm3
+ pxor %xmm1,%xmm2
+ pxor %xmm3,%xmm4
+ pand %xmm0,%xmm2
+ pand %xmm0,%xmm4
+ pxor %xmm1,%xmm2
+ pxor %xmm3,%xmm4
+ movdqu %xmm2,(%rdi,%r14,1)
+ movdqu %xmm4,16(%rdi,%r14,1)
+ movdqa %xmm5,(%rsp,%r14,1)
+ movdqa %xmm5,16(%rsp,%r14,1)
+
+ leaq 32(%r14),%r14
+ decq %r15
+ jnz .Lcopy4x
+
+ shlq $2,%r9
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq (%rsi),%r15
+ movq 8(%rsi),%r14
+ movq 16(%rsi),%r13
+ movq 24(%rsi),%r12
+ movq 32(%rsi),%rbp
+ movq 40(%rsi),%rbx
+ leaq 48(%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont,.-bn_mul4x_mont
+
+
+.type bn_sqr8x_mont,@function
+.align 32
+bn_sqr8x_mont:
+.Lsqr8x_enter:
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shlq $3+2,%r10
+ negq %r9
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,4),%r11
+ movq (%r8),%r8
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lsqr8x_sp_alt
+ subq %r11,%rsp
+ leaq -64(%rsp,%r9,4),%rsp
+ jmp .Lsqr8x_sp_done
+
+.align 32
+.Lsqr8x_sp_alt:
+ leaq 4096-64(,%r9,4),%r10
+ leaq -64(%rsp,%r9,4),%rsp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rsp
+.Lsqr8x_sp_done:
+ andq $-64,%rsp
+ movq %r9,%r10
+ negq %r9
+
+ leaq 64(%rsp,%r9,2),%r11
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.Lsqr8x_body:
+
+ movq %r9,%rbp
+.byte 102,73,15,110,211
+ shrq $3+2,%rbp
+ movl OPENSSL_ia32cap_P+8(%rip),%eax
+ jmp .Lsqr8x_copy_n
+
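+/* .Lsqr8x_copy_n: copy the modulus into the frame above the temporaries,
+   one word per 16-byte slot, in the layout the reduction code expects. */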
+.align 32
+.Lsqr8x_copy_n:
+ movq 0(%rcx),%xmm0
+ movq 8(%rcx),%xmm1
+ movq 16(%rcx),%xmm3
+ movq 24(%rcx),%xmm4
+ leaq 32(%rcx),%rcx
+ movdqa %xmm0,0(%r11)
+ movdqa %xmm1,16(%r11)
+ movdqa %xmm3,32(%r11)
+ movdqa %xmm4,48(%r11)
+ leaq 64(%r11),%r11
+ decq %rbp
+ jnz .Lsqr8x_copy_n
+
+ pxor %xmm0,%xmm0
+.byte 102,72,15,110,207
+.byte 102,73,15,110,218
+ call bn_sqr8x_internal
+
+ pxor %xmm0,%xmm0
+ leaq 48(%rsp),%rax
+ leaq 64(%rsp,%r9,2),%rdx
+ shrq $3+2,%r9
+ movq 40(%rsp),%rsi
+ jmp .Lsqr8x_zero
+
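+/* .Lsqr8x_zero: wipe both the temporary area and the modulus copy. */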
+.align 32
+.Lsqr8x_zero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ movdqa %xmm0,32(%rax)
+ movdqa %xmm0,48(%rax)
+ leaq 64(%rax),%rax
+ movdqa %xmm0,0(%rdx)
+ movdqa %xmm0,16(%rdx)
+ movdqa %xmm0,32(%rdx)
+ movdqa %xmm0,48(%rdx)
+ leaq 64(%rdx),%rdx
+ decq %r9
+ jnz .Lsqr8x_zero
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+.Lsqr8x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_sqr8x_mont,.-bn_sqr8x_mont
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+.align 16
+#endif
diff --git a/linux-x86_64/crypto/bn/x86_64-mont5.S b/linux-x86_64/crypto/bn/x86_64-mont5.S
new file mode 100644
index 0000000..357bc11
--- /dev/null
+++ b/linux-x86_64/crypto/bn/x86_64-mont5.S
@@ -0,0 +1,1822 @@
+#if defined(__x86_64__)
+.text
+
+
+
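+/* bn_mul_mont_gather5: Montgomery multiplication where the multiplier is
+   gathered from a table of precomputed powers (index passed on the stack)
+   via the SSE2 masks at .Lmagic_masks.  Word counts divisible by eight are
+   routed to the 4x-unrolled code below. */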
+.globl bn_mul_mont_gather5
+.hidden bn_mul_mont_gather5
+.type bn_mul_mont_gather5,@function
+.align 64
+bn_mul_mont_gather5:
+ testl $7,%r9d
+ jnz .Lmul_enter
+ jmp .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+ movl %r9d,%r9d
+ movq %rsp,%rax
+ movl 8(%rsp),%r10d
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ leaq 2(%r9),%r11
+ negq %r11
+ leaq (%rsp,%r11,8),%rsp
+ andq $-1024,%rsp
+
+ movq %rax,8(%rsp,%r9,8)
+.Lmul_body:
+ movq %rdx,%r12
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%r12,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+.byte 102,72,15,126,195
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+
+ xorq %r14,%r14
+ xorq %r15,%r15
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .L1st_enter
+
+.align 16
+.L1st:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ movq %r10,%r11
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.L1st_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ leaq 1(%r15),%r15
+ movq %rdx,%r10
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .L1st
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+ movq %r10,%r11
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ jmp .Louter
+.align 16
+.Louter:
+ xorq %r15,%r15
+ movq %r8,%rbp
+ movq (%rsp),%r10
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+
+ imulq %r10,%rbp
+ movq %rdx,%r11
+
+ por %xmm2,%xmm0
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi),%rax
+ adcq $0,%rdx
+ movq 8(%rsp),%r10
+ movq %rdx,%r13
+
+ leaq 1(%r15),%r15
+ jmp .Linner_enter
+
+.align 16
+.Linner:
+ addq %rax,%r13
+ movq (%rsi,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+.Linner_enter:
+ mulq %rbx
+ addq %rax,%r11
+ movq (%rcx,%r15,8),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ leaq 1(%r15),%r15
+
+ mulq %rbp
+ cmpq %r9,%r15
+ jne .Linner
+
+.byte 102,72,15,126,195
+
+ addq %rax,%r13
+ movq (%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ movq (%rsp,%r15,8),%r10
+ adcq $0,%rdx
+ movq %r13,-16(%rsp,%r15,8)
+ movq %rdx,%r13
+
+ xorq %rdx,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%rsp,%r9,8)
+ movq %rdx,(%rsp,%r9,8)
+
+ leaq 1(%r14),%r14
+ cmpq %r9,%r14
+ jb .Louter
+
+ xorq %r14,%r14
+ movq (%rsp),%rax
+ leaq (%rsp),%rsi
+ movq %r9,%r15
+ jmp .Lsub
+.align 16
+.Lsub: sbbq (%rcx,%r14,8),%rax
+ movq %rax,(%rdi,%r14,8)
+ movq 8(%rsi,%r14,8),%rax
+ leaq 1(%r14),%r14
+ decq %r15
+ jnz .Lsub
+
+ sbbq $0,%rax
+ xorq %r14,%r14
+ movq %r9,%r15
+.align 16
+.Lcopy:
+ movq (%rsp,%r14,8),%rsi
+ movq (%rdi,%r14,8),%rcx
+ xorq %rcx,%rsi
+ andq %rax,%rsi
+ xorq %rcx,%rsi
+ movq %r14,(%rsp,%r14,8)
+ movq %rsi,(%rdi,%r14,8)
+ leaq 1(%r14),%r14
+ subq $1,%r15
+ jnz .Lcopy
+
+ movq 8(%rsp,%r9,8),%rsi
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+.Lmul_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul_mont_gather5,.-bn_mul_mont_gather5
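+/* bn_mul4x_mont_gather5: 4x-unrolled variant; sets up an aligned stack frame
+   and hands the actual work to mul4x_internal. */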
+.type bn_mul4x_mont_gather5,@function
+.align 32
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+.byte 0x67
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+.byte 0x67
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shll $3+2,%r10d
+ negq %r9
+
+
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,2),%r11
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lmul4xsp_alt
+ subq %r11,%rsp
+ leaq -64(%rsp,%r9,2),%rsp
+ jmp .Lmul4xsp_done
+
+.align 32
+.Lmul4xsp_alt:
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rsp
+.Lmul4xsp_done:
+ andq $-64,%rsp
+ negq %r9
+
+ movq %rax,40(%rsp)
+.Lmul4x_body:
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+.Lmul4x_epilogue:
+ .byte 0xf3,0xc3
+.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
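+/* mul4x_internal: the 4x-unrolled multiply-and-reduce loop shared by
+   bn_mul4x_mont_gather5 and bn_power5; the gather of the next table entry
+   is interleaved with the arithmetic, and the routine exits through the
+   common .Lsqr4x_sub subtraction. */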
+.type mul4x_internal,@function
+.align 32
+mul4x_internal:
+ shlq $5,%r9
+ movl 8(%rax),%r10d
+ leaq 256(%rdx,%r9,1),%r13
+ shrq $5,%r9
+ movq %r10,%r11
+ shrq $3,%r10
+ andq $7,%r11
+ notq %r10
+ leaq .Lmagic_masks(%rip),%rax
+ andq $3,%r10
+ leaq 96(%rdx,%r11,8),%r12
+ movq 0(%rax,%r10,8),%xmm4
+ movq 8(%rax,%r10,8),%xmm5
+ addq $7,%r11
+ movq 16(%rax,%r10,8),%xmm6
+ movq 24(%rax,%r10,8),%xmm7
+ andq $7,%r11
+
+ movq -96(%r12),%xmm0
+ leaq 256(%r12),%r14
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+ pand %xmm6,%xmm2
+.byte 0x67
+ por %xmm1,%xmm0
+ movq -96(%r14),%xmm1
+.byte 0x67
+ pand %xmm7,%xmm3
+.byte 0x67
+ por %xmm2,%xmm0
+ movq -32(%r14),%xmm2
+.byte 0x67
+ pand %xmm4,%xmm1
+.byte 0x67
+ por %xmm3,%xmm0
+ movq 32(%r14),%xmm3
+
+.byte 102,72,15,126,195
+ movq 96(%r14),%xmm0
+ movq %r13,16+8(%rsp)
+ movq %rdi,56+8(%rsp)
+
+ movq (%r8),%r8
+ movq (%rsi),%rax
+ leaq (%rsi,%r9,1),%rsi
+ negq %r9
+
+ movq %r8,%rbp
+ mulq %rbx
+ movq %rax,%r10
+ movq (%rcx),%rax
+
+ pand %xmm5,%xmm2
+ pand %xmm6,%xmm3
+ por %xmm2,%xmm1
+
+ imulq %r10,%rbp
+
+
+
+
+
+
+
+ leaq 64+8(%rsp,%r11,8),%r14
+ movq %rdx,%r11
+
+ pand %xmm7,%xmm0
+ por %xmm3,%xmm1
+ leaq 512(%r12),%r12
+ por %xmm1,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 16(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 64(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+ jmp .L1st4x
+
+.align 32
+.L1st4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -32(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 16(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 64(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdi,(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz .L1st4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -32(%rcx),%rax
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+ leaq (%rcx,%r9,2),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ jmp .Louter4x
+
+.align 32
+.Louter4x:
+ movq (%r14,%r9,1),%r10
+ movq %r8,%rbp
+ mulq %rbx
+ addq %rax,%r10
+ movq (%rcx),%rax
+ adcq $0,%rdx
+
+ movq -96(%r12),%xmm0
+ movq -32(%r12),%xmm1
+ pand %xmm4,%xmm0
+ movq 32(%r12),%xmm2
+ pand %xmm5,%xmm1
+ movq 96(%r12),%xmm3
+
+ imulq %r10,%rbp
+.byte 0x67
+ movq %rdx,%r11
+ movq %rdi,(%r14)
+
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+ por %xmm2,%xmm0
+ leaq (%r14,%r9,1),%r14
+ leaq 256(%r12),%r12
+ por %xmm3,%xmm0
+
+ mulq %rbp
+ addq %rax,%r10
+ movq 8(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 16(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 32(%r9),%r15
+ leaq 64(%rcx),%rcx
+ adcq $0,%rdx
+ movq %rdx,%r13
+ jmp .Linner4x
+
+.align 32
+.Linner4x:
+ mulq %rbx
+ addq %rax,%r10
+ movq -32(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq -16(%rcx),%rax
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 0(%rcx),%rax
+ adcq $0,%rdx
+ addq (%r14),%r10
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq 8(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-16(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 16(%rcx),%rax
+ adcq $0,%rdx
+ addq 8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq 16(%rsi,%r15,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ leaq 64(%rcx),%rcx
+ adcq $0,%rdx
+ movq %r13,-8(%r14)
+ movq %rdx,%r13
+
+ addq $32,%r15
+ jnz .Linner4x
+
+ mulq %rbx
+ addq %rax,%r10
+ movq -32(%rcx),%rax
+ adcq $0,%rdx
+ addq 16(%r14),%r10
+ leaq 32(%r14),%r14
+ adcq $0,%rdx
+ movq %rdx,%r11
+
+ mulq %rbp
+ addq %rax,%r13
+ movq -8(%rsi),%rax
+ adcq $0,%rdx
+ addq %r10,%r13
+ adcq $0,%rdx
+ movq %rdi,-32(%r14)
+ movq %rdx,%rdi
+
+ mulq %rbx
+ addq %rax,%r11
+ movq %rbp,%rax
+ movq -16(%rcx),%rbp
+ adcq $0,%rdx
+ addq -8(%r14),%r11
+ adcq $0,%rdx
+ movq %rdx,%r10
+
+ mulq %rbp
+ addq %rax,%rdi
+ movq (%rsi,%r9,1),%rax
+ adcq $0,%rdx
+ addq %r11,%rdi
+ adcq $0,%rdx
+ movq %r13,-24(%r14)
+ movq %rdx,%r13
+
+.byte 102,72,15,126,195
+ movq %rdi,-16(%r14)
+ leaq (%rcx,%r9,2),%rcx
+
+ xorq %rdi,%rdi
+ addq %r10,%r13
+ adcq $0,%rdi
+ addq (%r14),%r13
+ adcq $0,%rdi
+ movq %r13,-8(%r14)
+
+ cmpq 16+8(%rsp),%r12
+ jb .Louter4x
+ subq %r13,%rbp
+ adcq %r15,%r15
+ orq %r15,%rdi
+ xorq $1,%rdi
+ leaq (%r14,%r9,1),%rbx
+ leaq (%rcx,%rdi,8),%rbp
+ movq %r9,%rcx
+ sarq $3+2,%rcx
+ movq 56+8(%rsp),%rdi
+ jmp .Lsqr4x_sub
+.size mul4x_internal,.-mul4x_internal
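+/* bn_power5: five back-to-back Montgomery squarings (__bn_sqr8x_internal)
+   followed by one Montgomery multiplication by a gathered table entry
+   (mul4x_internal); one 5-bit window step of the exponentiation. */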
+.globl bn_power5
+.hidden bn_power5
+.type bn_power5,@function
+.align 32
+bn_power5:
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shll $3+2,%r10d
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,2),%r11
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lpwr_sp_alt
+ subq %r11,%rsp
+ leaq -64(%rsp,%r9,2),%rsp
+ jmp .Lpwr_sp_done
+
+.align 32
+.Lpwr_sp_alt:
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rsp
+.Lpwr_sp_done:
+ andq $-64,%rsp
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.Lpower5_body:
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 102,73,15,110,218
+.byte 102,72,15,110,226
+
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+ call __bn_sqr8x_internal
+
+.byte 102,72,15,126,209
+.byte 102,72,15,126,226
+ movq %rsi,%rdi
+ movq 40(%rsp),%rax
+ leaq 32(%rsp),%r8
+
+ call mul4x_internal
+
+ movq 40(%rsp),%rsi
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+.Lpower5_epilogue:
+ .byte 0xf3,0xc3
+.size bn_power5,.-bn_power5
+
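+/* bn_sqr8x_internal: full-width squaring of the input followed by the
+   Montgomery reduction (sqr8x_reduction) and the final subtraction. */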
+.globl bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.hidden bn_sqr8x_internal
+.type bn_sqr8x_internal,@function
+.align 32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ leaq 32(%r10),%rbp
+ leaq (%rsi,%r9,1),%rsi
+
+ movq %r9,%rcx
+
+
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ movq %r10,-24(%rdi,%rbp,1)
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r11,-16(%rdi,%rbp,1)
+ movq %rdx,%r10
+
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ movq %rax,%r12
+ movq %rbx,%rax
+ movq %rdx,%r13
+
+ leaq (%rbp),%rcx
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+ jmp .Lsqr4x_1st
+
+.align 32
+.Lsqr4x_1st:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq 16(%rsi,%rcx,1),%rbx
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %r10,8(%rdi,%rcx,1)
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 24(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,16(%rdi,%rcx,1)
+ movq %rdx,%r13
+ adcq $0,%r13
+ leaq 32(%rcx),%rcx
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_1st
+
+ mulq %r15
+ addq %rax,%r13
+ leaq 16(%rbp),%rbp
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+ jmp .Lsqr4x_outer
+
+.align 32
+.Lsqr4x_outer:
+ movq -32(%rsi,%rbp,1),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi,%rbp,1),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi,%rbp,1),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ movq -24(%rdi,%rbp,1),%r10
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ movq %r10,-24(%rdi,%rbp,1)
+ movq %rdx,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -16(%rdi,%rbp,1),%r11
+ movq %rdx,%r10
+ adcq $0,%r10
+ movq %r11,-16(%rdi,%rbp,1)
+
+ xorq %r12,%r12
+
+ movq -8(%rsi,%rbp,1),%rbx
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq -8(%rdi,%rbp,1),%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rbp,1)
+
+ leaq (%rbp),%rcx
+ jmp .Lsqr4x_inner
+
+.align 32
+.Lsqr4x_inner:
+ movq (%rsi,%rcx,1),%rbx
+ mulq %r15
+ addq %rax,%r13
+ movq %rbx,%rax
+ movq %rdx,%r12
+ adcq $0,%r12
+ addq (%rdi,%rcx,1),%r13
+ adcq $0,%r12
+
+.byte 0x67
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq 8(%rsi,%rcx,1),%rbx
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %r11,(%rdi,%rcx,1)
+ movq %rbx,%rax
+ movq %rdx,%r13
+ adcq $0,%r13
+ addq 8(%rdi,%rcx,1),%r12
+ leaq 16(%rcx),%rcx
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ adcq $0,%rdx
+ addq %r12,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+ movq %r10,-8(%rdi,%rcx,1)
+
+ cmpq $0,%rcx
+ jne .Lsqr4x_inner
+
+.byte 0x67
+ mulq %r15
+ addq %rax,%r13
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ addq $16,%rbp
+ jnz .Lsqr4x_outer
+
+
+ movq -32(%rsi),%r14
+ leaq 48+8(%rsp,%r9,2),%rdi
+ movq -24(%rsi),%rax
+ leaq -32(%rdi,%rbp,1),%rdi
+ movq -16(%rsi),%rbx
+ movq %rax,%r15
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %r14
+ addq %rax,%r11
+ movq %rbx,%rax
+ movq %r10,-24(%rdi)
+ movq %rdx,%r10
+ adcq $0,%r10
+ addq %r13,%r11
+ movq -8(%rsi),%rbx
+ adcq $0,%r10
+
+ mulq %r15
+ addq %rax,%r12
+ movq %rbx,%rax
+ movq %r11,-16(%rdi)
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %r14
+ addq %rax,%r10
+ movq %rbx,%rax
+ movq %rdx,%r11
+ adcq $0,%r11
+ addq %r12,%r10
+ adcq $0,%r11
+ movq %r10,-8(%rdi)
+
+ mulq %r15
+ addq %rax,%r13
+ movq -16(%rsi),%rax
+ adcq $0,%rdx
+ addq %r11,%r13
+ adcq $0,%rdx
+
+ movq %r13,(%rdi)
+ movq %rdx,%r12
+ movq %rdx,8(%rdi)
+
+ mulq %rbx
+ addq $16,%rbp
+ xorq %r14,%r14
+ subq %r9,%rbp
+ xorq %r15,%r15
+
+ addq %r12,%rax
+ adcq $0,%rdx
+ movq %rax,8(%rdi)
+ movq %rdx,16(%rdi)
+ movq %r15,24(%rdi)
+
+ movq -16(%rsi,%rbp,1),%rax
+ leaq 48+8(%rsp),%rdi
+ xorq %r10,%r10
+ movq 8(%rdi),%r11
+
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ leaq 16(%rbp),%rbp
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ jmp .Lsqr4x_shift_n_add
+
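+/* .Lsqr4x_shift_n_add: double the accumulated cross products and add the
+   squared diagonal terms on top. */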
+.align 32
+.Lsqr4x_shift_n_add:
+ leaq (%r14,%r10,2),%r12
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi,%rbp,1),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 0(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 8(%rdi),%r11
+ adcq %rax,%rbx
+ movq 0(%rsi,%rbp,1),%rax
+ movq %rbx,-16(%rdi)
+ adcq %rdx,%r8
+
+ leaq (%r14,%r10,2),%r12
+ movq %r8,-8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq 16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 24(%rdi),%r11
+ adcq %rax,%r12
+ movq 8(%rsi,%rbp,1),%rax
+ movq %r12,0(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,8(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ movq 32(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq 40(%rdi),%r11
+ adcq %rax,%rbx
+ movq 16(%rsi,%rbp,1),%rax
+ movq %rbx,16(%rdi)
+ adcq %rdx,%r8
+ movq %r8,24(%rdi)
+ sbbq %r15,%r15
+ leaq 64(%rdi),%rdi
+ addq $32,%rbp
+ jnz .Lsqr4x_shift_n_add
+
+ leaq (%r14,%r10,2),%r12
+.byte 0x67
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r13
+ shrq $63,%r11
+ orq %r10,%r13
+ movq -16(%rdi),%r10
+ movq %r11,%r14
+ mulq %rax
+ negq %r15
+ movq -8(%rdi),%r11
+ adcq %rax,%r12
+ movq -8(%rsi),%rax
+ movq %r12,-32(%rdi)
+ adcq %rdx,%r13
+
+ leaq (%r14,%r10,2),%rbx
+ movq %r13,-24(%rdi)
+ sbbq %r15,%r15
+ shrq $63,%r10
+ leaq (%rcx,%r11,2),%r8
+ shrq $63,%r11
+ orq %r10,%r8
+ mulq %rax
+ negq %r15
+ adcq %rax,%rbx
+ adcq %rdx,%r8
+ movq %rbx,-16(%rdi)
+ movq %r8,-8(%rdi)
+.byte 102,72,15,126,213
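+/* sqr8x_reduction: Montgomery-reduce the double-width result in the scratch
+   area, processing eight words at a time. */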
+sqr8x_reduction:
+ xorq %rax,%rax
+ leaq (%rbp,%r9,2),%rcx
+ leaq 48+8(%rsp,%r9,2),%rdx
+ movq %rcx,0+8(%rsp)
+ leaq 48+8(%rsp,%r9,1),%rdi
+ movq %rdx,8+8(%rsp)
+ negq %r9
+ jmp .L8x_reduction_loop
+
+.align 32
+.L8x_reduction_loop:
+ leaq (%rdi,%r9,1),%rdi
+.byte 0x66
+ movq 0(%rdi),%rbx
+ movq 8(%rdi),%r9
+ movq 16(%rdi),%r10
+ movq 24(%rdi),%r11
+ movq 32(%rdi),%r12
+ movq 40(%rdi),%r13
+ movq 48(%rdi),%r14
+ movq 56(%rdi),%r15
+ movq %rax,(%rdx)
+ leaq 64(%rdi),%rdi
+
+.byte 0x67
+ movq %rbx,%r8
+ imulq 32+8(%rsp),%rbx
+ movq 0(%rbp),%rax
+ movl $8,%ecx
+ jmp .L8x_reduce
+
+.align 32
+.L8x_reduce:
+ mulq %rbx
+ movq 16(%rbp),%rax
+ negq %r8
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ movq %rbx,48-8+8(%rsp,%rcx,8)
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq 32+8(%rsp),%rsi
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 64(%rbp),%rax
+ adcq $0,%rdx
+ imulq %r8,%rsi
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 80(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 96(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 112(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq %rsi,%rbx
+ addq %rax,%r15
+ movq 0(%rbp),%rax
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz .L8x_reduce
+
+ leaq 128(%rbp),%rbp
+ xorq %rax,%rax
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae .L8x_no_tail
+
+.byte 0x66
+ addq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movq 48+56+8(%rsp),%rbx
+ movl $8,%ecx
+ movq 0(%rbp),%rax
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail:
+ mulq %rbx
+ addq %rax,%r8
+ movq 16(%rbp),%rax
+ movq %r8,(%rdi)
+ movq %rdx,%r8
+ adcq $0,%r8
+
+ mulq %rbx
+ addq %rax,%r9
+ movq 32(%rbp),%rax
+ adcq $0,%rdx
+ addq %r9,%r8
+ leaq 8(%rdi),%rdi
+ movq %rdx,%r9
+ adcq $0,%r9
+
+ mulq %rbx
+ addq %rax,%r10
+ movq 48(%rbp),%rax
+ adcq $0,%rdx
+ addq %r10,%r9
+ movq %rdx,%r10
+ adcq $0,%r10
+
+ mulq %rbx
+ addq %rax,%r11
+ movq 64(%rbp),%rax
+ adcq $0,%rdx
+ addq %r11,%r10
+ movq %rdx,%r11
+ adcq $0,%r11
+
+ mulq %rbx
+ addq %rax,%r12
+ movq 80(%rbp),%rax
+ adcq $0,%rdx
+ addq %r12,%r11
+ movq %rdx,%r12
+ adcq $0,%r12
+
+ mulq %rbx
+ addq %rax,%r13
+ movq 96(%rbp),%rax
+ adcq $0,%rdx
+ addq %r13,%r12
+ movq %rdx,%r13
+ adcq $0,%r13
+
+ mulq %rbx
+ addq %rax,%r14
+ movq 112(%rbp),%rax
+ adcq $0,%rdx
+ addq %r14,%r13
+ movq %rdx,%r14
+ adcq $0,%r14
+
+ mulq %rbx
+ movq 48-16+8(%rsp,%rcx,8),%rbx
+ addq %rax,%r15
+ adcq $0,%rdx
+ addq %r15,%r14
+ movq 0(%rbp),%rax
+ movq %rdx,%r15
+ adcq $0,%r15
+
+ decl %ecx
+ jnz .L8x_tail
+
+ leaq 128(%rbp),%rbp
+ movq 8+8(%rsp),%rdx
+ cmpq 0+8(%rsp),%rbp
+ jae .L8x_tail_done
+
+ movq 48+56+8(%rsp),%rbx
+ negq %rsi
+ movq 0(%rbp),%rax
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ sbbq %rsi,%rsi
+
+ movl $8,%ecx
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail_done:
+ addq (%rdx),%r8
+ xorq %rax,%rax
+
+ negq %rsi
+.L8x_no_tail:
+ adcq 0(%rdi),%r8
+ adcq 8(%rdi),%r9
+ adcq 16(%rdi),%r10
+ adcq 24(%rdi),%r11
+ adcq 32(%rdi),%r12
+ adcq 40(%rdi),%r13
+ adcq 48(%rdi),%r14
+ adcq 56(%rdi),%r15
+ adcq $0,%rax
+ movq -16(%rbp),%rcx
+ xorq %rsi,%rsi
+
+.byte 102,72,15,126,213
+
+ movq %r8,0(%rdi)
+ movq %r9,8(%rdi)
+.byte 102,73,15,126,217
+ movq %r10,16(%rdi)
+ movq %r11,24(%rdi)
+ movq %r12,32(%rdi)
+ movq %r13,40(%rdi)
+ movq %r14,48(%rdi)
+ movq %r15,56(%rdi)
+ leaq 64(%rdi),%rdi
+
+ cmpq %rdx,%rdi
+ jb .L8x_reduction_loop
+
+ subq %r15,%rcx
+ leaq (%rdi,%r9,1),%rbx
+ adcq %rsi,%rsi
+ movq %r9,%rcx
+ orq %rsi,%rax
+.byte 102,72,15,126,207
+ xorq $1,%rax
+.byte 102,72,15,126,206
+ leaq (%rbp,%rax,8),%rbp
+ sarq $3+2,%rcx
+ jmp .Lsqr4x_sub
+
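+/* .Lsqr4x_sub: final subtraction writing the result to the output buffer;
+   also the common tail of mul4x_internal. */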
+.align 32
+.Lsqr4x_sub:
+.byte 0x66
+ movq 0(%rbx),%r12
+ movq 8(%rbx),%r13
+ sbbq 0(%rbp),%r12
+ movq 16(%rbx),%r14
+ sbbq 16(%rbp),%r13
+ movq 24(%rbx),%r15
+ leaq 32(%rbx),%rbx
+ sbbq 32(%rbp),%r14
+ movq %r12,0(%rdi)
+ sbbq 48(%rbp),%r15
+ leaq 64(%rbp),%rbp
+ movq %r13,8(%rdi)
+ movq %r14,16(%rdi)
+ movq %r15,24(%rdi)
+ leaq 32(%rdi),%rdi
+
+ incq %rcx
+ jnz .Lsqr4x_sub
+ movq %r9,%r10
+ negq %r9
+ .byte 0xf3,0xc3
+.size bn_sqr8x_internal,.-bn_sqr8x_internal
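+/* bn_from_montgomery / bn_from_mont8x: convert out of Montgomery form by
+   multiplying by 1 and reducing; only sizes divisible by eight are handled
+   here, anything else returns 0 so the caller can take a generic path. */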
+.globl bn_from_montgomery
+.hidden bn_from_montgomery
+.type bn_from_montgomery,@function
+.align 32
+bn_from_montgomery:
+ testl $7,%r9d
+ jz bn_from_mont8x
+ xorl %eax,%eax
+ .byte 0xf3,0xc3
+.size bn_from_montgomery,.-bn_from_montgomery
+
+.type bn_from_mont8x,@function
+.align 32
+bn_from_mont8x:
+.byte 0x67
+ movq %rsp,%rax
+ pushq %rbx
+ pushq %rbp
+ pushq %r12
+ pushq %r13
+ pushq %r14
+ pushq %r15
+.byte 0x67
+ movl %r9d,%r10d
+ shll $3,%r9d
+ shll $3+2,%r10d
+ negq %r9
+ movq (%r8),%r8
+
+
+
+
+
+
+
+ leaq -64(%rsp,%r9,2),%r11
+ subq %rsi,%r11
+ andq $4095,%r11
+ cmpq %r11,%r10
+ jb .Lfrom_sp_alt
+ subq %r11,%rsp
+ leaq -64(%rsp,%r9,2),%rsp
+ jmp .Lfrom_sp_done
+
+.align 32
+.Lfrom_sp_alt:
+ leaq 4096-64(,%r9,2),%r10
+ leaq -64(%rsp,%r9,2),%rsp
+ subq %r10,%r11
+ movq $0,%r10
+ cmovcq %r10,%r11
+ subq %r11,%rsp
+.Lfrom_sp_done:
+ andq $-64,%rsp
+ movq %r9,%r10
+ negq %r9
+
+
+
+
+
+
+
+
+
+
+ movq %r8,32(%rsp)
+ movq %rax,40(%rsp)
+.Lfrom_body:
+ movq %r9,%r11
+ leaq 48(%rsp),%rax
+ pxor %xmm0,%xmm0
+ jmp .Lmul_by_1
+
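+/* .Lmul_by_1: copy the input into the lower half of the scratch frame and
+   zero the upper half; the subsequent sqr8x_reduction divides out R. */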
+.align 32
+.Lmul_by_1:
+ movdqu (%rsi),%xmm1
+ movdqu 16(%rsi),%xmm2
+ movdqu 32(%rsi),%xmm3
+ movdqa %xmm0,(%rax,%r9,1)
+ movdqu 48(%rsi),%xmm4
+ movdqa %xmm0,16(%rax,%r9,1)
+.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
+ movdqa %xmm1,(%rax)
+ movdqa %xmm0,32(%rax,%r9,1)
+ movdqa %xmm2,16(%rax)
+ movdqa %xmm0,48(%rax,%r9,1)
+ movdqa %xmm3,32(%rax)
+ movdqa %xmm4,48(%rax)
+ leaq 64(%rax),%rax
+ subq $64,%r11
+ jnz .Lmul_by_1
+
+.byte 102,72,15,110,207
+.byte 102,72,15,110,209
+.byte 0x67
+ movq %rcx,%rbp
+.byte 102,73,15,110,218
+ call sqr8x_reduction
+
+ pxor %xmm0,%xmm0
+ leaq 48(%rsp),%rax
+ movq 40(%rsp),%rsi
+ jmp .Lfrom_mont_zero
+
+.align 32
+.Lfrom_mont_zero:
+ movdqa %xmm0,0(%rax)
+ movdqa %xmm0,16(%rax)
+ movdqa %xmm0,32(%rax)
+ movdqa %xmm0,48(%rax)
+ leaq 64(%rax),%rax
+ subq $32,%r9
+ jnz .Lfrom_mont_zero
+
+ movq $1,%rax
+ movq -48(%rsi),%r15
+ movq -40(%rsi),%r14
+ movq -32(%rsi),%r13
+ movq -24(%rsi),%r12
+ movq -16(%rsi),%rbp
+ movq -8(%rsi),%rbx
+ leaq (%rsi),%rsp
+.Lfrom_epilogue:
+ .byte 0xf3,0xc3
+.size bn_from_mont8x,.-bn_from_mont8x
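+/* bn_scatter5: store a num-word value into the gather table starting at the
+   given index, with consecutive words spaced 256 bytes apart. */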
+.globl bn_scatter5
+.hidden bn_scatter5
+.type bn_scatter5,@function
+.align 16
+bn_scatter5:
+ cmpl $0,%esi
+ jz .Lscatter_epilogue
+ leaq (%rdx,%rcx,8),%rdx
+.Lscatter:
+ movq (%rdi),%rax
+ leaq 8(%rdi),%rdi
+ movq %rax,(%rdx)
+ leaq 256(%rdx),%rdx
+ subl $1,%esi
+ jnz .Lscatter
+.Lscatter_epilogue:
+ .byte 0xf3,0xc3
+.size bn_scatter5,.-bn_scatter5
+
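+/* bn_gather5: read a num-word value back from a bn_scatter5 table, selecting
+   the requested entry with the mask-and-or sequence driven by .Lmagic_masks. */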
+.globl bn_gather5
+.hidden bn_gather5
+.type bn_gather5,@function
+.align 16
+bn_gather5:
+ movl %ecx,%r11d
+ shrl $3,%ecx
+ andq $7,%r11
+ notl %ecx
+ leaq .Lmagic_masks(%rip),%rax
+ andl $3,%ecx
+ leaq 128(%rdx,%r11,8),%rdx
+ movq 0(%rax,%rcx,8),%xmm4
+ movq 8(%rax,%rcx,8),%xmm5
+ movq 16(%rax,%rcx,8),%xmm6
+ movq 24(%rax,%rcx,8),%xmm7
+ jmp .Lgather
+.align 16
+.Lgather:
+ movq -128(%rdx),%xmm0
+ movq -64(%rdx),%xmm1
+ pand %xmm4,%xmm0
+ movq 0(%rdx),%xmm2
+ pand %xmm5,%xmm1
+ movq 64(%rdx),%xmm3
+ pand %xmm6,%xmm2
+ por %xmm1,%xmm0
+ pand %xmm7,%xmm3
+.byte 0x67,0x67
+ por %xmm2,%xmm0
+ leaq 256(%rdx),%rdx
+ por %xmm3,%xmm0
+
+ movq %xmm0,(%rdi)
+ leaq 8(%rdi),%rdi
+ subl $1,%esi
+ jnz .Lgather
+ .byte 0xf3,0xc3
+.LSEH_end_bn_gather5:
+.size bn_gather5,.-bn_gather5
+.align 64
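+/* Qword-select masks used by the gather code above. */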
+.Lmagic_masks:
+.long 0,0, 0,0, 0,0, -1,-1
+.long 0,0, 0,0, 0,0, 0,0
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
+#endif