naiveproxy/third_party/boringssl/ios-aarch64/crypto/chacha/chacha-armv8.S
2018-01-28 13:32:06 -05:00

1970 lines
39 KiB
ArmAsm

#include <openssl/arm_arch.h>
.text
.align 5
Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
Lone:
.long 1,0,0,0
LOPENSSL_armcap_P:
#ifdef __ILP32__
.long _OPENSSL_armcap_P-.
#else
.quad _OPENSSL_armcap_P-.
#endif
.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.globl _ChaCha20_ctr32
.private_extern _ChaCha20_ctr32
.align 5
_ChaCha20_ctr32:
cbz x2,Labort
adr x5,LOPENSSL_armcap_P
cmp x2,#192
b.lo Lshort
#ifdef __ILP32__
ldrsw x6,[x5]
#else
ldr x6,[x5]
#endif
ldr w17,[x6,x5]
tst w17,#ARMV7_NEON
b.ne ChaCha20_neon
Lshort:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr x5,Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
sub sp,sp,#64
ldp x22,x23,[x5] // load sigma
ldp x24,x25,[x3] // load key
ldp x26,x27,[x3,#16]
ldp x28,x30,[x4] // load counter
#ifdef __ARMEB__
ror x24,x24,#32
ror x25,x25,#32
ror x26,x26,#32
ror x27,x27,#32
ror x28,x28,#32
ror x30,x30,#32
#endif
Loop_outer:
mov w5,w22 // unpack key block
lsr x6,x22,#32
mov w7,w23
lsr x8,x23,#32
mov w9,w24
lsr x10,x24,#32
mov w11,w25
lsr x12,x25,#32
mov w13,w26
lsr x14,x26,#32
mov w15,w27
lsr x16,x27,#32
mov w17,w28
lsr x19,x28,#32
mov w20,w30
lsr x21,x30,#32
mov x4,#10
subs x2,x2,#64
Loop:
sub x4,x4,#1
add w5,w5,w9
add w6,w6,w10
add w7,w7,w11
add w8,w8,w12
eor w17,w17,w5
eor w19,w19,w6
eor w20,w20,w7
eor w21,w21,w8
ror w17,w17,#16
ror w19,w19,#16
ror w20,w20,#16
ror w21,w21,#16
add w13,w13,w17
add w14,w14,w19
add w15,w15,w20
add w16,w16,w21
eor w9,w9,w13
eor w10,w10,w14
eor w11,w11,w15
eor w12,w12,w16
ror w9,w9,#20
ror w10,w10,#20
ror w11,w11,#20
ror w12,w12,#20
add w5,w5,w9
add w6,w6,w10
add w7,w7,w11
add w8,w8,w12
eor w17,w17,w5
eor w19,w19,w6
eor w20,w20,w7
eor w21,w21,w8
ror w17,w17,#24
ror w19,w19,#24
ror w20,w20,#24
ror w21,w21,#24
add w13,w13,w17
add w14,w14,w19
add w15,w15,w20
add w16,w16,w21
eor w9,w9,w13
eor w10,w10,w14
eor w11,w11,w15
eor w12,w12,w16
ror w9,w9,#25
ror w10,w10,#25
ror w11,w11,#25
ror w12,w12,#25
add w5,w5,w10
add w6,w6,w11
add w7,w7,w12
add w8,w8,w9
eor w21,w21,w5
eor w17,w17,w6
eor w19,w19,w7
eor w20,w20,w8
ror w21,w21,#16
ror w17,w17,#16
ror w19,w19,#16
ror w20,w20,#16
add w15,w15,w21
add w16,w16,w17
add w13,w13,w19
add w14,w14,w20
eor w10,w10,w15
eor w11,w11,w16
eor w12,w12,w13
eor w9,w9,w14
ror w10,w10,#20
ror w11,w11,#20
ror w12,w12,#20
ror w9,w9,#20
add w5,w5,w10
add w6,w6,w11
add w7,w7,w12
add w8,w8,w9
eor w21,w21,w5
eor w17,w17,w6
eor w19,w19,w7
eor w20,w20,w8
ror w21,w21,#24
ror w17,w17,#24
ror w19,w19,#24
ror w20,w20,#24
add w15,w15,w21
add w16,w16,w17
add w13,w13,w19
add w14,w14,w20
eor w10,w10,w15
eor w11,w11,w16
eor w12,w12,w13
eor w9,w9,w14
ror w10,w10,#25
ror w11,w11,#25
ror w12,w12,#25
ror w9,w9,#25
cbnz x4,Loop
add w5,w5,w22 // accumulate key block
add x6,x6,x22,lsr#32
add w7,w7,w23
add x8,x8,x23,lsr#32
add w9,w9,w24
add x10,x10,x24,lsr#32
add w11,w11,w25
add x12,x12,x25,lsr#32
add w13,w13,w26
add x14,x14,x26,lsr#32
add w15,w15,w27
add x16,x16,x27,lsr#32
add w17,w17,w28
add x19,x19,x28,lsr#32
add w20,w20,w30
add x21,x21,x30,lsr#32
b.lo Ltail
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
ldp x6,x8,[x1,#0] // load input
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
ldp x10,x12,[x1,#16]
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
ldp x14,x16,[x1,#32]
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
ldp x19,x21,[x1,#48]
add x1,x1,#64
#ifdef __ARMEB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
eor x5,x5,x6
eor x7,x7,x8
eor x9,x9,x10
eor x11,x11,x12
eor x13,x13,x14
eor x15,x15,x16
eor x17,x17,x19
eor x20,x20,x21
stp x5,x7,[x0,#0] // store output
add x28,x28,#1 // increment counter
stp x9,x11,[x0,#16]
stp x13,x15,[x0,#32]
stp x17,x20,[x0,#48]
add x0,x0,#64
b.hi Loop_outer
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
Labort:
ret
.align 4
Ltail:
add x2,x2,#64
Less_than_64:
sub x0,x0,#1
add x1,x1,x2
add x0,x0,x2
add x4,sp,x2
neg x2,x2
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
#ifdef __ARMEB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
stp x5,x7,[sp,#0]
stp x9,x11,[sp,#16]
stp x13,x15,[sp,#32]
stp x17,x20,[sp,#48]
Loop_tail:
ldrb w10,[x1,x2]
ldrb w11,[x4,x2]
add x2,x2,#1
eor w10,w10,w11
strb w10,[x0,x2]
cbnz x2,Loop_tail
stp xzr,xzr,[sp,#0]
stp xzr,xzr,[sp,#16]
stp xzr,xzr,[sp,#32]
stp xzr,xzr,[sp,#48]
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
.align 5
ChaCha20_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr x5,Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
cmp x2,#512
b.hs L512_or_more_neon
sub sp,sp,#64
ldp x22,x23,[x5] // load sigma
ld1 {v24.4s},[x5],#16
ldp x24,x25,[x3] // load key
ldp x26,x27,[x3,#16]
ld1 {v25.4s,v26.4s},[x3]
ldp x28,x30,[x4] // load counter
ld1 {v27.4s},[x4]
ld1 {v31.4s},[x5]
#ifdef __ARMEB__
rev64 v24.4s,v24.4s
ror x24,x24,#32
ror x25,x25,#32
ror x26,x26,#32
ror x27,x27,#32
ror x28,x28,#32
ror x30,x30,#32
#endif
add v27.4s,v27.4s,v31.4s // += 1
add v28.4s,v27.4s,v31.4s
add v29.4s,v28.4s,v31.4s
shl v31.4s,v31.4s,#2 // 1 -> 4
Loop_outer_neon:
mov w5,w22 // unpack key block
lsr x6,x22,#32
mov v0.16b,v24.16b
mov w7,w23
lsr x8,x23,#32
mov v4.16b,v24.16b
mov w9,w24
lsr x10,x24,#32
mov v16.16b,v24.16b
mov w11,w25
mov v1.16b,v25.16b
lsr x12,x25,#32
mov v5.16b,v25.16b
mov w13,w26
mov v17.16b,v25.16b
lsr x14,x26,#32
mov v3.16b,v27.16b
mov w15,w27
mov v7.16b,v28.16b
lsr x16,x27,#32
mov v19.16b,v29.16b
mov w17,w28
mov v2.16b,v26.16b
lsr x19,x28,#32
mov v6.16b,v26.16b
mov w20,w30
mov v18.16b,v26.16b
lsr x21,x30,#32
mov x4,#10
subs x2,x2,#256
Loop_neon:
sub x4,x4,#1
add v0.4s,v0.4s,v1.4s
add w5,w5,w9
add v4.4s,v4.4s,v5.4s
add w6,w6,w10
add v16.4s,v16.4s,v17.4s
add w7,w7,w11
eor v3.16b,v3.16b,v0.16b
add w8,w8,w12
eor v7.16b,v7.16b,v4.16b
eor w17,w17,w5
eor v19.16b,v19.16b,v16.16b
eor w19,w19,w6
rev32 v3.8h,v3.8h
eor w20,w20,w7
rev32 v7.8h,v7.8h
eor w21,w21,w8
rev32 v19.8h,v19.8h
ror w17,w17,#16
add v2.4s,v2.4s,v3.4s
ror w19,w19,#16
add v6.4s,v6.4s,v7.4s
ror w20,w20,#16
add v18.4s,v18.4s,v19.4s
ror w21,w21,#16
eor v20.16b,v1.16b,v2.16b
add w13,w13,w17
eor v21.16b,v5.16b,v6.16b
add w14,w14,w19
eor v22.16b,v17.16b,v18.16b
add w15,w15,w20
ushr v1.4s,v20.4s,#20
add w16,w16,w21
ushr v5.4s,v21.4s,#20
eor w9,w9,w13
ushr v17.4s,v22.4s,#20
eor w10,w10,w14
sli v1.4s,v20.4s,#12
eor w11,w11,w15
sli v5.4s,v21.4s,#12
eor w12,w12,w16
sli v17.4s,v22.4s,#12
ror w9,w9,#20
add v0.4s,v0.4s,v1.4s
ror w10,w10,#20
add v4.4s,v4.4s,v5.4s
ror w11,w11,#20
add v16.4s,v16.4s,v17.4s
ror w12,w12,#20
eor v20.16b,v3.16b,v0.16b
add w5,w5,w9
eor v21.16b,v7.16b,v4.16b
add w6,w6,w10
eor v22.16b,v19.16b,v16.16b
add w7,w7,w11
ushr v3.4s,v20.4s,#24
add w8,w8,w12
ushr v7.4s,v21.4s,#24
eor w17,w17,w5
ushr v19.4s,v22.4s,#24
eor w19,w19,w6
sli v3.4s,v20.4s,#8
eor w20,w20,w7
sli v7.4s,v21.4s,#8
eor w21,w21,w8
sli v19.4s,v22.4s,#8
ror w17,w17,#24
add v2.4s,v2.4s,v3.4s
ror w19,w19,#24
add v6.4s,v6.4s,v7.4s
ror w20,w20,#24
add v18.4s,v18.4s,v19.4s
ror w21,w21,#24
eor v20.16b,v1.16b,v2.16b
add w13,w13,w17
eor v21.16b,v5.16b,v6.16b
add w14,w14,w19
eor v22.16b,v17.16b,v18.16b
add w15,w15,w20
ushr v1.4s,v20.4s,#25
add w16,w16,w21
ushr v5.4s,v21.4s,#25
eor w9,w9,w13
ushr v17.4s,v22.4s,#25
eor w10,w10,w14
sli v1.4s,v20.4s,#7
eor w11,w11,w15
sli v5.4s,v21.4s,#7
eor w12,w12,w16
sli v17.4s,v22.4s,#7
ror w9,w9,#25
ext v2.16b,v2.16b,v2.16b,#8
ror w10,w10,#25
ext v6.16b,v6.16b,v6.16b,#8
ror w11,w11,#25
ext v18.16b,v18.16b,v18.16b,#8
ror w12,w12,#25
ext v3.16b,v3.16b,v3.16b,#12
ext v7.16b,v7.16b,v7.16b,#12
ext v19.16b,v19.16b,v19.16b,#12
ext v1.16b,v1.16b,v1.16b,#4
ext v5.16b,v5.16b,v5.16b,#4
ext v17.16b,v17.16b,v17.16b,#4
add v0.4s,v0.4s,v1.4s
add w5,w5,w10
add v4.4s,v4.4s,v5.4s
add w6,w6,w11
add v16.4s,v16.4s,v17.4s
add w7,w7,w12
eor v3.16b,v3.16b,v0.16b
add w8,w8,w9
eor v7.16b,v7.16b,v4.16b
eor w21,w21,w5
eor v19.16b,v19.16b,v16.16b
eor w17,w17,w6
rev32 v3.8h,v3.8h
eor w19,w19,w7
rev32 v7.8h,v7.8h
eor w20,w20,w8
rev32 v19.8h,v19.8h
ror w21,w21,#16
add v2.4s,v2.4s,v3.4s
ror w17,w17,#16
add v6.4s,v6.4s,v7.4s
ror w19,w19,#16
add v18.4s,v18.4s,v19.4s
ror w20,w20,#16
eor v20.16b,v1.16b,v2.16b
add w15,w15,w21
eor v21.16b,v5.16b,v6.16b
add w16,w16,w17
eor v22.16b,v17.16b,v18.16b
add w13,w13,w19
ushr v1.4s,v20.4s,#20
add w14,w14,w20
ushr v5.4s,v21.4s,#20
eor w10,w10,w15
ushr v17.4s,v22.4s,#20
eor w11,w11,w16
sli v1.4s,v20.4s,#12
eor w12,w12,w13
sli v5.4s,v21.4s,#12
eor w9,w9,w14
sli v17.4s,v22.4s,#12
ror w10,w10,#20
add v0.4s,v0.4s,v1.4s
ror w11,w11,#20
add v4.4s,v4.4s,v5.4s
ror w12,w12,#20
add v16.4s,v16.4s,v17.4s
ror w9,w9,#20
eor v20.16b,v3.16b,v0.16b
add w5,w5,w10
eor v21.16b,v7.16b,v4.16b
add w6,w6,w11
eor v22.16b,v19.16b,v16.16b
add w7,w7,w12
ushr v3.4s,v20.4s,#24
add w8,w8,w9
ushr v7.4s,v21.4s,#24
eor w21,w21,w5
ushr v19.4s,v22.4s,#24
eor w17,w17,w6
sli v3.4s,v20.4s,#8
eor w19,w19,w7
sli v7.4s,v21.4s,#8
eor w20,w20,w8
sli v19.4s,v22.4s,#8
ror w21,w21,#24
add v2.4s,v2.4s,v3.4s
ror w17,w17,#24
add v6.4s,v6.4s,v7.4s
ror w19,w19,#24
add v18.4s,v18.4s,v19.4s
ror w20,w20,#24
eor v20.16b,v1.16b,v2.16b
add w15,w15,w21
eor v21.16b,v5.16b,v6.16b
add w16,w16,w17
eor v22.16b,v17.16b,v18.16b
add w13,w13,w19
ushr v1.4s,v20.4s,#25
add w14,w14,w20
ushr v5.4s,v21.4s,#25
eor w10,w10,w15
ushr v17.4s,v22.4s,#25
eor w11,w11,w16
sli v1.4s,v20.4s,#7
eor w12,w12,w13
sli v5.4s,v21.4s,#7
eor w9,w9,w14
sli v17.4s,v22.4s,#7
ror w10,w10,#25
ext v2.16b,v2.16b,v2.16b,#8
ror w11,w11,#25
ext v6.16b,v6.16b,v6.16b,#8
ror w12,w12,#25
ext v18.16b,v18.16b,v18.16b,#8
ror w9,w9,#25
ext v3.16b,v3.16b,v3.16b,#4
ext v7.16b,v7.16b,v7.16b,#4
ext v19.16b,v19.16b,v19.16b,#4
ext v1.16b,v1.16b,v1.16b,#12
ext v5.16b,v5.16b,v5.16b,#12
ext v17.16b,v17.16b,v17.16b,#12
cbnz x4,Loop_neon
add w5,w5,w22 // accumulate key block
add v0.4s,v0.4s,v24.4s
add x6,x6,x22,lsr#32
add v4.4s,v4.4s,v24.4s
add w7,w7,w23
add v16.4s,v16.4s,v24.4s
add x8,x8,x23,lsr#32
add v2.4s,v2.4s,v26.4s
add w9,w9,w24
add v6.4s,v6.4s,v26.4s
add x10,x10,x24,lsr#32
add v18.4s,v18.4s,v26.4s
add w11,w11,w25
add v3.4s,v3.4s,v27.4s
add x12,x12,x25,lsr#32
add w13,w13,w26
add v7.4s,v7.4s,v28.4s
add x14,x14,x26,lsr#32
add w15,w15,w27
add v19.4s,v19.4s,v29.4s
add x16,x16,x27,lsr#32
add w17,w17,w28
add v1.4s,v1.4s,v25.4s
add x19,x19,x28,lsr#32
add w20,w20,w30
add v5.4s,v5.4s,v25.4s
add x21,x21,x30,lsr#32
add v17.4s,v17.4s,v25.4s
b.lo Ltail_neon
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
ldp x6,x8,[x1,#0] // load input
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
ldp x10,x12,[x1,#16]
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
ldp x14,x16,[x1,#32]
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
ldp x19,x21,[x1,#48]
add x1,x1,#64
#ifdef __ARMEB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
eor x5,x5,x6
eor x7,x7,x8
eor x9,x9,x10
eor x11,x11,x12
eor x13,x13,x14
eor v0.16b,v0.16b,v20.16b
eor x15,x15,x16
eor v1.16b,v1.16b,v21.16b
eor x17,x17,x19
eor v2.16b,v2.16b,v22.16b
eor x20,x20,x21
eor v3.16b,v3.16b,v23.16b
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
stp x5,x7,[x0,#0] // store output
add x28,x28,#4 // increment counter
stp x9,x11,[x0,#16]
add v27.4s,v27.4s,v31.4s // += 4
stp x13,x15,[x0,#32]
add v28.4s,v28.4s,v31.4s
stp x17,x20,[x0,#48]
add v29.4s,v29.4s,v31.4s
add x0,x0,#64
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
eor v4.16b,v4.16b,v20.16b
eor v5.16b,v5.16b,v21.16b
eor v6.16b,v6.16b,v22.16b
eor v7.16b,v7.16b,v23.16b
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
eor v16.16b,v16.16b,v0.16b
eor v17.16b,v17.16b,v1.16b
eor v18.16b,v18.16b,v2.16b
eor v19.16b,v19.16b,v3.16b
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
b.hi Loop_outer_neon
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
Ltail_neon:
add x2,x2,#256
cmp x2,#64
b.lo Less_than_64
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
ldp x6,x8,[x1,#0] // load input
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
ldp x10,x12,[x1,#16]
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
ldp x14,x16,[x1,#32]
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
ldp x19,x21,[x1,#48]
add x1,x1,#64
#ifdef __ARMEB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
eor x5,x5,x6
eor x7,x7,x8
eor x9,x9,x10
eor x11,x11,x12
eor x13,x13,x14
eor x15,x15,x16
eor x17,x17,x19
eor x20,x20,x21
stp x5,x7,[x0,#0] // store output
add x28,x28,#4 // increment counter
stp x9,x11,[x0,#16]
stp x13,x15,[x0,#32]
stp x17,x20,[x0,#48]
add x0,x0,#64
b.eq Ldone_neon
sub x2,x2,#64
cmp x2,#64
b.lo Less_than_128
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
eor v0.16b,v0.16b,v20.16b
eor v1.16b,v1.16b,v21.16b
eor v2.16b,v2.16b,v22.16b
eor v3.16b,v3.16b,v23.16b
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
b.eq Ldone_neon
sub x2,x2,#64
cmp x2,#64
b.lo Less_than_192
ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
eor v4.16b,v4.16b,v20.16b
eor v5.16b,v5.16b,v21.16b
eor v6.16b,v6.16b,v22.16b
eor v7.16b,v7.16b,v23.16b
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
b.eq Ldone_neon
sub x2,x2,#64
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp]
b Last_neon
Less_than_128:
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp]
b Last_neon
Less_than_192:
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp]
b Last_neon
.align 4
Last_neon:
sub x0,x0,#1
add x1,x1,x2
add x0,x0,x2
add x4,sp,x2
neg x2,x2
Loop_tail_neon:
ldrb w10,[x1,x2]
ldrb w11,[x4,x2]
add x2,x2,#1
eor w10,w10,w11
strb w10,[x0,x2]
cbnz x2,Loop_tail_neon
stp xzr,xzr,[sp,#0]
stp xzr,xzr,[sp,#16]
stp xzr,xzr,[sp,#32]
stp xzr,xzr,[sp,#48]
Ldone_neon:
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret
.align 5
ChaCha20_512_neon:
stp x29,x30,[sp,#-96]!
add x29,sp,#0
adr x5,Lsigma
stp x19,x20,[sp,#16]
stp x21,x22,[sp,#32]
stp x23,x24,[sp,#48]
stp x25,x26,[sp,#64]
stp x27,x28,[sp,#80]
L512_or_more_neon:
sub sp,sp,#128+64
ldp x22,x23,[x5] // load sigma
ld1 {v24.4s},[x5],#16
ldp x24,x25,[x3] // load key
ldp x26,x27,[x3,#16]
ld1 {v25.4s,v26.4s},[x3]
ldp x28,x30,[x4] // load counter
ld1 {v27.4s},[x4]
ld1 {v31.4s},[x5]
#ifdef __ARMEB__
rev64 v24.4s,v24.4s
ror x24,x24,#32
ror x25,x25,#32
ror x26,x26,#32
ror x27,x27,#32
ror x28,x28,#32
ror x30,x30,#32
#endif
add v27.4s,v27.4s,v31.4s // += 1
stp q24,q25,[sp,#0] // off-load key block, invariant part
add v27.4s,v27.4s,v31.4s // not typo
str q26,[sp,#32]
add v28.4s,v27.4s,v31.4s
add v29.4s,v28.4s,v31.4s
add v30.4s,v29.4s,v31.4s
shl v31.4s,v31.4s,#2 // 1 -> 4
stp d8,d9,[sp,#128+0] // meet ABI requirements
stp d10,d11,[sp,#128+16]
stp d12,d13,[sp,#128+32]
stp d14,d15,[sp,#128+48]
sub x2,x2,#512 // not typo
Loop_outer_512_neon:
mov v0.16b,v24.16b
mov v4.16b,v24.16b
mov v8.16b,v24.16b
mov v12.16b,v24.16b
mov v16.16b,v24.16b
mov v20.16b,v24.16b
mov v1.16b,v25.16b
mov w5,w22 // unpack key block
mov v5.16b,v25.16b
lsr x6,x22,#32
mov v9.16b,v25.16b
mov w7,w23
mov v13.16b,v25.16b
lsr x8,x23,#32
mov v17.16b,v25.16b
mov w9,w24
mov v21.16b,v25.16b
lsr x10,x24,#32
mov v3.16b,v27.16b
mov w11,w25
mov v7.16b,v28.16b
lsr x12,x25,#32
mov v11.16b,v29.16b
mov w13,w26
mov v15.16b,v30.16b
lsr x14,x26,#32
mov v2.16b,v26.16b
mov w15,w27
mov v6.16b,v26.16b
lsr x16,x27,#32
add v19.4s,v3.4s,v31.4s // +4
mov w17,w28
add v23.4s,v7.4s,v31.4s // +4
lsr x19,x28,#32
mov v10.16b,v26.16b
mov w20,w30
mov v14.16b,v26.16b
lsr x21,x30,#32
mov v18.16b,v26.16b
stp q27,q28,[sp,#48] // off-load key block, variable part
mov v22.16b,v26.16b
str q29,[sp,#80]
mov x4,#5
subs x2,x2,#512
Loop_upper_neon:
sub x4,x4,#1
add v0.4s,v0.4s,v1.4s
add w5,w5,w9
add v4.4s,v4.4s,v5.4s
add w6,w6,w10
add v8.4s,v8.4s,v9.4s
add w7,w7,w11
add v12.4s,v12.4s,v13.4s
add w8,w8,w12
add v16.4s,v16.4s,v17.4s
eor w17,w17,w5
add v20.4s,v20.4s,v21.4s
eor w19,w19,w6
eor v3.16b,v3.16b,v0.16b
eor w20,w20,w7
eor v7.16b,v7.16b,v4.16b
eor w21,w21,w8
eor v11.16b,v11.16b,v8.16b
ror w17,w17,#16
eor v15.16b,v15.16b,v12.16b
ror w19,w19,#16
eor v19.16b,v19.16b,v16.16b
ror w20,w20,#16
eor v23.16b,v23.16b,v20.16b
ror w21,w21,#16
rev32 v3.8h,v3.8h
add w13,w13,w17
rev32 v7.8h,v7.8h
add w14,w14,w19
rev32 v11.8h,v11.8h
add w15,w15,w20
rev32 v15.8h,v15.8h
add w16,w16,w21
rev32 v19.8h,v19.8h
eor w9,w9,w13
rev32 v23.8h,v23.8h
eor w10,w10,w14
add v2.4s,v2.4s,v3.4s
eor w11,w11,w15
add v6.4s,v6.4s,v7.4s
eor w12,w12,w16
add v10.4s,v10.4s,v11.4s
ror w9,w9,#20
add v14.4s,v14.4s,v15.4s
ror w10,w10,#20
add v18.4s,v18.4s,v19.4s
ror w11,w11,#20
add v22.4s,v22.4s,v23.4s
ror w12,w12,#20
eor v24.16b,v1.16b,v2.16b
add w5,w5,w9
eor v25.16b,v5.16b,v6.16b
add w6,w6,w10
eor v26.16b,v9.16b,v10.16b
add w7,w7,w11
eor v27.16b,v13.16b,v14.16b
add w8,w8,w12
eor v28.16b,v17.16b,v18.16b
eor w17,w17,w5
eor v29.16b,v21.16b,v22.16b
eor w19,w19,w6
ushr v1.4s,v24.4s,#20
eor w20,w20,w7
ushr v5.4s,v25.4s,#20
eor w21,w21,w8
ushr v9.4s,v26.4s,#20
ror w17,w17,#24
ushr v13.4s,v27.4s,#20
ror w19,w19,#24
ushr v17.4s,v28.4s,#20
ror w20,w20,#24
ushr v21.4s,v29.4s,#20
ror w21,w21,#24
sli v1.4s,v24.4s,#12
add w13,w13,w17
sli v5.4s,v25.4s,#12
add w14,w14,w19
sli v9.4s,v26.4s,#12
add w15,w15,w20
sli v13.4s,v27.4s,#12
add w16,w16,w21
sli v17.4s,v28.4s,#12
eor w9,w9,w13
sli v21.4s,v29.4s,#12
eor w10,w10,w14
add v0.4s,v0.4s,v1.4s
eor w11,w11,w15
add v4.4s,v4.4s,v5.4s
eor w12,w12,w16
add v8.4s,v8.4s,v9.4s
ror w9,w9,#25
add v12.4s,v12.4s,v13.4s
ror w10,w10,#25
add v16.4s,v16.4s,v17.4s
ror w11,w11,#25
add v20.4s,v20.4s,v21.4s
ror w12,w12,#25
eor v24.16b,v3.16b,v0.16b
add w5,w5,w10
eor v25.16b,v7.16b,v4.16b
add w6,w6,w11
eor v26.16b,v11.16b,v8.16b
add w7,w7,w12
eor v27.16b,v15.16b,v12.16b
add w8,w8,w9
eor v28.16b,v19.16b,v16.16b
eor w21,w21,w5
eor v29.16b,v23.16b,v20.16b
eor w17,w17,w6
ushr v3.4s,v24.4s,#24
eor w19,w19,w7
ushr v7.4s,v25.4s,#24
eor w20,w20,w8
ushr v11.4s,v26.4s,#24
ror w21,w21,#16
ushr v15.4s,v27.4s,#24
ror w17,w17,#16
ushr v19.4s,v28.4s,#24
ror w19,w19,#16
ushr v23.4s,v29.4s,#24
ror w20,w20,#16
sli v3.4s,v24.4s,#8
add w15,w15,w21
sli v7.4s,v25.4s,#8
add w16,w16,w17
sli v11.4s,v26.4s,#8
add w13,w13,w19
sli v15.4s,v27.4s,#8
add w14,w14,w20
sli v19.4s,v28.4s,#8
eor w10,w10,w15
sli v23.4s,v29.4s,#8
eor w11,w11,w16
add v2.4s,v2.4s,v3.4s
eor w12,w12,w13
add v6.4s,v6.4s,v7.4s
eor w9,w9,w14
add v10.4s,v10.4s,v11.4s
ror w10,w10,#20
add v14.4s,v14.4s,v15.4s
ror w11,w11,#20
add v18.4s,v18.4s,v19.4s
ror w12,w12,#20
add v22.4s,v22.4s,v23.4s
ror w9,w9,#20
eor v24.16b,v1.16b,v2.16b
add w5,w5,w10
eor v25.16b,v5.16b,v6.16b
add w6,w6,w11
eor v26.16b,v9.16b,v10.16b
add w7,w7,w12
eor v27.16b,v13.16b,v14.16b
add w8,w8,w9
eor v28.16b,v17.16b,v18.16b
eor w21,w21,w5
eor v29.16b,v21.16b,v22.16b
eor w17,w17,w6
ushr v1.4s,v24.4s,#25
eor w19,w19,w7
ushr v5.4s,v25.4s,#25
eor w20,w20,w8
ushr v9.4s,v26.4s,#25
ror w21,w21,#24
ushr v13.4s,v27.4s,#25
ror w17,w17,#24
ushr v17.4s,v28.4s,#25
ror w19,w19,#24
ushr v21.4s,v29.4s,#25
ror w20,w20,#24
sli v1.4s,v24.4s,#7
add w15,w15,w21
sli v5.4s,v25.4s,#7
add w16,w16,w17
sli v9.4s,v26.4s,#7
add w13,w13,w19
sli v13.4s,v27.4s,#7
add w14,w14,w20
sli v17.4s,v28.4s,#7
eor w10,w10,w15
sli v21.4s,v29.4s,#7
eor w11,w11,w16
ext v2.16b,v2.16b,v2.16b,#8
eor w12,w12,w13
ext v6.16b,v6.16b,v6.16b,#8
eor w9,w9,w14
ext v10.16b,v10.16b,v10.16b,#8
ror w10,w10,#25
ext v14.16b,v14.16b,v14.16b,#8
ror w11,w11,#25
ext v18.16b,v18.16b,v18.16b,#8
ror w12,w12,#25
ext v22.16b,v22.16b,v22.16b,#8
ror w9,w9,#25
ext v3.16b,v3.16b,v3.16b,#12
ext v7.16b,v7.16b,v7.16b,#12
ext v11.16b,v11.16b,v11.16b,#12
ext v15.16b,v15.16b,v15.16b,#12
ext v19.16b,v19.16b,v19.16b,#12
ext v23.16b,v23.16b,v23.16b,#12
ext v1.16b,v1.16b,v1.16b,#4
ext v5.16b,v5.16b,v5.16b,#4
ext v9.16b,v9.16b,v9.16b,#4
ext v13.16b,v13.16b,v13.16b,#4
ext v17.16b,v17.16b,v17.16b,#4
ext v21.16b,v21.16b,v21.16b,#4
add v0.4s,v0.4s,v1.4s
add w5,w5,w9
add v4.4s,v4.4s,v5.4s
add w6,w6,w10
add v8.4s,v8.4s,v9.4s
add w7,w7,w11
add v12.4s,v12.4s,v13.4s
add w8,w8,w12
add v16.4s,v16.4s,v17.4s
eor w17,w17,w5
add v20.4s,v20.4s,v21.4s
eor w19,w19,w6
eor v3.16b,v3.16b,v0.16b
eor w20,w20,w7
eor v7.16b,v7.16b,v4.16b
eor w21,w21,w8
eor v11.16b,v11.16b,v8.16b
ror w17,w17,#16
eor v15.16b,v15.16b,v12.16b
ror w19,w19,#16
eor v19.16b,v19.16b,v16.16b
ror w20,w20,#16
eor v23.16b,v23.16b,v20.16b
ror w21,w21,#16
rev32 v3.8h,v3.8h
add w13,w13,w17
rev32 v7.8h,v7.8h
add w14,w14,w19
rev32 v11.8h,v11.8h
add w15,w15,w20
rev32 v15.8h,v15.8h
add w16,w16,w21
rev32 v19.8h,v19.8h
eor w9,w9,w13
rev32 v23.8h,v23.8h
eor w10,w10,w14
add v2.4s,v2.4s,v3.4s
eor w11,w11,w15
add v6.4s,v6.4s,v7.4s
eor w12,w12,w16
add v10.4s,v10.4s,v11.4s
ror w9,w9,#20
add v14.4s,v14.4s,v15.4s
ror w10,w10,#20
add v18.4s,v18.4s,v19.4s
ror w11,w11,#20
add v22.4s,v22.4s,v23.4s
ror w12,w12,#20
eor v24.16b,v1.16b,v2.16b
add w5,w5,w9
eor v25.16b,v5.16b,v6.16b
add w6,w6,w10
eor v26.16b,v9.16b,v10.16b
add w7,w7,w11
eor v27.16b,v13.16b,v14.16b
add w8,w8,w12
eor v28.16b,v17.16b,v18.16b
eor w17,w17,w5
eor v29.16b,v21.16b,v22.16b
eor w19,w19,w6
ushr v1.4s,v24.4s,#20
eor w20,w20,w7
ushr v5.4s,v25.4s,#20
eor w21,w21,w8
ushr v9.4s,v26.4s,#20
ror w17,w17,#24
ushr v13.4s,v27.4s,#20
ror w19,w19,#24
ushr v17.4s,v28.4s,#20
ror w20,w20,#24
ushr v21.4s,v29.4s,#20
ror w21,w21,#24
sli v1.4s,v24.4s,#12
add w13,w13,w17
sli v5.4s,v25.4s,#12
add w14,w14,w19
sli v9.4s,v26.4s,#12
add w15,w15,w20
sli v13.4s,v27.4s,#12
add w16,w16,w21
sli v17.4s,v28.4s,#12
eor w9,w9,w13
sli v21.4s,v29.4s,#12
eor w10,w10,w14
add v0.4s,v0.4s,v1.4s
eor w11,w11,w15
add v4.4s,v4.4s,v5.4s
eor w12,w12,w16
add v8.4s,v8.4s,v9.4s
ror w9,w9,#25
add v12.4s,v12.4s,v13.4s
ror w10,w10,#25
add v16.4s,v16.4s,v17.4s
ror w11,w11,#25
add v20.4s,v20.4s,v21.4s
ror w12,w12,#25
eor v24.16b,v3.16b,v0.16b
add w5,w5,w10
eor v25.16b,v7.16b,v4.16b
add w6,w6,w11
eor v26.16b,v11.16b,v8.16b
add w7,w7,w12
eor v27.16b,v15.16b,v12.16b
add w8,w8,w9
eor v28.16b,v19.16b,v16.16b
eor w21,w21,w5
eor v29.16b,v23.16b,v20.16b
eor w17,w17,w6
ushr v3.4s,v24.4s,#24
eor w19,w19,w7
ushr v7.4s,v25.4s,#24
eor w20,w20,w8
ushr v11.4s,v26.4s,#24
ror w21,w21,#16
ushr v15.4s,v27.4s,#24
ror w17,w17,#16
ushr v19.4s,v28.4s,#24
ror w19,w19,#16
ushr v23.4s,v29.4s,#24
ror w20,w20,#16
sli v3.4s,v24.4s,#8
add w15,w15,w21
sli v7.4s,v25.4s,#8
add w16,w16,w17
sli v11.4s,v26.4s,#8
add w13,w13,w19
sli v15.4s,v27.4s,#8
add w14,w14,w20
sli v19.4s,v28.4s,#8
eor w10,w10,w15
sli v23.4s,v29.4s,#8
eor w11,w11,w16
add v2.4s,v2.4s,v3.4s
eor w12,w12,w13
add v6.4s,v6.4s,v7.4s
eor w9,w9,w14
add v10.4s,v10.4s,v11.4s
ror w10,w10,#20
add v14.4s,v14.4s,v15.4s
ror w11,w11,#20
add v18.4s,v18.4s,v19.4s
ror w12,w12,#20
add v22.4s,v22.4s,v23.4s
ror w9,w9,#20
eor v24.16b,v1.16b,v2.16b
add w5,w5,w10
eor v25.16b,v5.16b,v6.16b
add w6,w6,w11
eor v26.16b,v9.16b,v10.16b
add w7,w7,w12
eor v27.16b,v13.16b,v14.16b
add w8,w8,w9
eor v28.16b,v17.16b,v18.16b
eor w21,w21,w5
eor v29.16b,v21.16b,v22.16b
eor w17,w17,w6
ushr v1.4s,v24.4s,#25
eor w19,w19,w7
ushr v5.4s,v25.4s,#25
eor w20,w20,w8
ushr v9.4s,v26.4s,#25
ror w21,w21,#24
ushr v13.4s,v27.4s,#25
ror w17,w17,#24
ushr v17.4s,v28.4s,#25
ror w19,w19,#24
ushr v21.4s,v29.4s,#25
ror w20,w20,#24
sli v1.4s,v24.4s,#7
add w15,w15,w21
sli v5.4s,v25.4s,#7
add w16,w16,w17
sli v9.4s,v26.4s,#7
add w13,w13,w19
sli v13.4s,v27.4s,#7
add w14,w14,w20
sli v17.4s,v28.4s,#7
eor w10,w10,w15
sli v21.4s,v29.4s,#7
eor w11,w11,w16
ext v2.16b,v2.16b,v2.16b,#8
eor w12,w12,w13
ext v6.16b,v6.16b,v6.16b,#8
eor w9,w9,w14
ext v10.16b,v10.16b,v10.16b,#8
ror w10,w10,#25
ext v14.16b,v14.16b,v14.16b,#8
ror w11,w11,#25
ext v18.16b,v18.16b,v18.16b,#8
ror w12,w12,#25
ext v22.16b,v22.16b,v22.16b,#8
ror w9,w9,#25
ext v3.16b,v3.16b,v3.16b,#4
ext v7.16b,v7.16b,v7.16b,#4
ext v11.16b,v11.16b,v11.16b,#4
ext v15.16b,v15.16b,v15.16b,#4
ext v19.16b,v19.16b,v19.16b,#4
ext v23.16b,v23.16b,v23.16b,#4
ext v1.16b,v1.16b,v1.16b,#12
ext v5.16b,v5.16b,v5.16b,#12
ext v9.16b,v9.16b,v9.16b,#12
ext v13.16b,v13.16b,v13.16b,#12
ext v17.16b,v17.16b,v17.16b,#12
ext v21.16b,v21.16b,v21.16b,#12
cbnz x4,Loop_upper_neon
add w5,w5,w22 // accumulate key block
add x6,x6,x22,lsr#32
add w7,w7,w23
add x8,x8,x23,lsr#32
add w9,w9,w24
add x10,x10,x24,lsr#32
add w11,w11,w25
add x12,x12,x25,lsr#32
add w13,w13,w26
add x14,x14,x26,lsr#32
add w15,w15,w27
add x16,x16,x27,lsr#32
add w17,w17,w28
add x19,x19,x28,lsr#32
add w20,w20,w30
add x21,x21,x30,lsr#32
add x5,x5,x6,lsl#32 // pack
add x7,x7,x8,lsl#32
ldp x6,x8,[x1,#0] // load input
add x9,x9,x10,lsl#32
add x11,x11,x12,lsl#32
ldp x10,x12,[x1,#16]
add x13,x13,x14,lsl#32
add x15,x15,x16,lsl#32
ldp x14,x16,[x1,#32]
add x17,x17,x19,lsl#32
add x20,x20,x21,lsl#32
ldp x19,x21,[x1,#48]
add x1,x1,#64
#ifdef __ARMEB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
eor x5,x5,x6
eor x7,x7,x8
eor x9,x9,x10
eor x11,x11,x12
eor x13,x13,x14
eor x15,x15,x16
eor x17,x17,x19
eor x20,x20,x21
stp x5,x7,[x0,#0] // store output
add x28,x28,#1 // increment counter
mov w5,w22 // unpack key block
lsr x6,x22,#32
stp x9,x11,[x0,#16]
mov w7,w23
lsr x8,x23,#32
stp x13,x15,[x0,#32]
mov w9,w24
lsr x10,x24,#32
stp x17,x20,[x0,#48]
add x0,x0,#64
mov w11,w25
lsr x12,x25,#32
mov w13,w26
lsr x14,x26,#32
mov w15,w27
lsr x16,x27,#32
mov w17,w28
lsr x19,x28,#32
mov w20,w30
lsr x21,x30,#32
mov x4,#5
Loop_lower_neon:
sub x4,x4,#1
add v0.4s,v0.4s,v1.4s
add w5,w5,w9
add v4.4s,v4.4s,v5.4s
add w6,w6,w10
add v8.4s,v8.4s,v9.4s
add w7,w7,w11
add v12.4s,v12.4s,v13.4s
add w8,w8,w12
add v16.4s,v16.4s,v17.4s
eor w17,w17,w5
add v20.4s,v20.4s,v21.4s
eor w19,w19,w6
eor v3.16b,v3.16b,v0.16b
eor w20,w20,w7
eor v7.16b,v7.16b,v4.16b
eor w21,w21,w8
eor v11.16b,v11.16b,v8.16b
ror w17,w17,#16
eor v15.16b,v15.16b,v12.16b
ror w19,w19,#16
eor v19.16b,v19.16b,v16.16b
ror w20,w20,#16
eor v23.16b,v23.16b,v20.16b
ror w21,w21,#16
rev32 v3.8h,v3.8h
add w13,w13,w17
rev32 v7.8h,v7.8h
add w14,w14,w19
rev32 v11.8h,v11.8h
add w15,w15,w20
rev32 v15.8h,v15.8h
add w16,w16,w21
rev32 v19.8h,v19.8h
eor w9,w9,w13
rev32 v23.8h,v23.8h
eor w10,w10,w14
add v2.4s,v2.4s,v3.4s
eor w11,w11,w15
add v6.4s,v6.4s,v7.4s
eor w12,w12,w16
add v10.4s,v10.4s,v11.4s
ror w9,w9,#20
add v14.4s,v14.4s,v15.4s
ror w10,w10,#20
add v18.4s,v18.4s,v19.4s
ror w11,w11,#20
add v22.4s,v22.4s,v23.4s
ror w12,w12,#20
eor v24.16b,v1.16b,v2.16b
add w5,w5,w9
eor v25.16b,v5.16b,v6.16b
add w6,w6,w10
eor v26.16b,v9.16b,v10.16b
add w7,w7,w11
eor v27.16b,v13.16b,v14.16b
add w8,w8,w12
eor v28.16b,v17.16b,v18.16b
eor w17,w17,w5
eor v29.16b,v21.16b,v22.16b
eor w19,w19,w6
ushr v1.4s,v24.4s,#20
eor w20,w20,w7
ushr v5.4s,v25.4s,#20
eor w21,w21,w8
ushr v9.4s,v26.4s,#20
ror w17,w17,#24
ushr v13.4s,v27.4s,#20
ror w19,w19,#24
ushr v17.4s,v28.4s,#20
ror w20,w20,#24
ushr v21.4s,v29.4s,#20
ror w21,w21,#24
sli v1.4s,v24.4s,#12
add w13,w13,w17
sli v5.4s,v25.4s,#12
add w14,w14,w19
sli v9.4s,v26.4s,#12
add w15,w15,w20
sli v13.4s,v27.4s,#12
add w16,w16,w21
sli v17.4s,v28.4s,#12
eor w9,w9,w13
sli v21.4s,v29.4s,#12
eor w10,w10,w14
add v0.4s,v0.4s,v1.4s
eor w11,w11,w15
add v4.4s,v4.4s,v5.4s
eor w12,w12,w16
add v8.4s,v8.4s,v9.4s
ror w9,w9,#25
add v12.4s,v12.4s,v13.4s
ror w10,w10,#25
add v16.4s,v16.4s,v17.4s
ror w11,w11,#25
add v20.4s,v20.4s,v21.4s
ror w12,w12,#25
eor v24.16b,v3.16b,v0.16b
add w5,w5,w10
eor v25.16b,v7.16b,v4.16b
add w6,w6,w11
eor v26.16b,v11.16b,v8.16b
add w7,w7,w12
eor v27.16b,v15.16b,v12.16b
add w8,w8,w9
eor v28.16b,v19.16b,v16.16b
eor w21,w21,w5
eor v29.16b,v23.16b,v20.16b
eor w17,w17,w6
ushr v3.4s,v24.4s,#24
eor w19,w19,w7
ushr v7.4s,v25.4s,#24
eor w20,w20,w8
ushr v11.4s,v26.4s,#24
ror w21,w21,#16
ushr v15.4s,v27.4s,#24
ror w17,w17,#16
ushr v19.4s,v28.4s,#24
ror w19,w19,#16
ushr v23.4s,v29.4s,#24
ror w20,w20,#16
sli v3.4s,v24.4s,#8
add w15,w15,w21
sli v7.4s,v25.4s,#8
add w16,w16,w17
sli v11.4s,v26.4s,#8
add w13,w13,w19
sli v15.4s,v27.4s,#8
add w14,w14,w20
sli v19.4s,v28.4s,#8
eor w10,w10,w15
sli v23.4s,v29.4s,#8
eor w11,w11,w16
add v2.4s,v2.4s,v3.4s
eor w12,w12,w13
add v6.4s,v6.4s,v7.4s
eor w9,w9,w14
add v10.4s,v10.4s,v11.4s
ror w10,w10,#20
add v14.4s,v14.4s,v15.4s
ror w11,w11,#20
add v18.4s,v18.4s,v19.4s
ror w12,w12,#20
add v22.4s,v22.4s,v23.4s
ror w9,w9,#20
eor v24.16b,v1.16b,v2.16b
add w5,w5,w10
eor v25.16b,v5.16b,v6.16b
add w6,w6,w11
eor v26.16b,v9.16b,v10.16b
add w7,w7,w12
eor v27.16b,v13.16b,v14.16b
add w8,w8,w9
eor v28.16b,v17.16b,v18.16b
eor w21,w21,w5
eor v29.16b,v21.16b,v22.16b
eor w17,w17,w6
ushr v1.4s,v24.4s,#25
eor w19,w19,w7
ushr v5.4s,v25.4s,#25
eor w20,w20,w8
ushr v9.4s,v26.4s,#25
ror w21,w21,#24
ushr v13.4s,v27.4s,#25
ror w17,w17,#24
ushr v17.4s,v28.4s,#25
ror w19,w19,#24
ushr v21.4s,v29.4s,#25
ror w20,w20,#24
sli v1.4s,v24.4s,#7
add w15,w15,w21
sli v5.4s,v25.4s,#7
add w16,w16,w17
sli v9.4s,v26.4s,#7
add w13,w13,w19
sli v13.4s,v27.4s,#7
add w14,w14,w20
sli v17.4s,v28.4s,#7
eor w10,w10,w15
sli v21.4s,v29.4s,#7
eor w11,w11,w16
ext v2.16b,v2.16b,v2.16b,#8
eor w12,w12,w13
ext v6.16b,v6.16b,v6.16b,#8
eor w9,w9,w14
ext v10.16b,v10.16b,v10.16b,#8
ror w10,w10,#25
ext v14.16b,v14.16b,v14.16b,#8
ror w11,w11,#25
ext v18.16b,v18.16b,v18.16b,#8
ror w12,w12,#25
ext v22.16b,v22.16b,v22.16b,#8
ror w9,w9,#25
ext v3.16b,v3.16b,v3.16b,#12
ext v7.16b,v7.16b,v7.16b,#12
ext v11.16b,v11.16b,v11.16b,#12
ext v15.16b,v15.16b,v15.16b,#12
ext v19.16b,v19.16b,v19.16b,#12
ext v23.16b,v23.16b,v23.16b,#12
ext v1.16b,v1.16b,v1.16b,#4
ext v5.16b,v5.16b,v5.16b,#4
ext v9.16b,v9.16b,v9.16b,#4
ext v13.16b,v13.16b,v13.16b,#4
ext v17.16b,v17.16b,v17.16b,#4
ext v21.16b,v21.16b,v21.16b,#4
add v0.4s,v0.4s,v1.4s
add w5,w5,w9
add v4.4s,v4.4s,v5.4s
add w6,w6,w10
add v8.4s,v8.4s,v9.4s
add w7,w7,w11
add v12.4s,v12.4s,v13.4s
add w8,w8,w12
add v16.4s,v16.4s,v17.4s
eor w17,w17,w5
add v20.4s,v20.4s,v21.4s
eor w19,w19,w6
eor v3.16b,v3.16b,v0.16b
eor w20,w20,w7
eor v7.16b,v7.16b,v4.16b
eor w21,w21,w8
eor v11.16b,v11.16b,v8.16b
ror w17,w17,#16
eor v15.16b,v15.16b,v12.16b
ror w19,w19,#16
eor v19.16b,v19.16b,v16.16b
ror w20,w20,#16
eor v23.16b,v23.16b,v20.16b
ror w21,w21,#16
rev32 v3.8h,v3.8h
add w13,w13,w17
rev32 v7.8h,v7.8h
add w14,w14,w19
rev32 v11.8h,v11.8h
add w15,w15,w20
rev32 v15.8h,v15.8h
add w16,w16,w21
rev32 v19.8h,v19.8h
eor w9,w9,w13
rev32 v23.8h,v23.8h
eor w10,w10,w14
add v2.4s,v2.4s,v3.4s
eor w11,w11,w15
add v6.4s,v6.4s,v7.4s
eor w12,w12,w16
add v10.4s,v10.4s,v11.4s
ror w9,w9,#20
add v14.4s,v14.4s,v15.4s
ror w10,w10,#20
add v18.4s,v18.4s,v19.4s
ror w11,w11,#20
add v22.4s,v22.4s,v23.4s
ror w12,w12,#20
eor v24.16b,v1.16b,v2.16b
add w5,w5,w9
eor v25.16b,v5.16b,v6.16b
add w6,w6,w10
eor v26.16b,v9.16b,v10.16b
add w7,w7,w11
eor v27.16b,v13.16b,v14.16b
add w8,w8,w12
eor v28.16b,v17.16b,v18.16b
eor w17,w17,w5
eor v29.16b,v21.16b,v22.16b
eor w19,w19,w6
ushr v1.4s,v24.4s,#20
eor w20,w20,w7
ushr v5.4s,v25.4s,#20
eor w21,w21,w8
ushr v9.4s,v26.4s,#20
ror w17,w17,#24
ushr v13.4s,v27.4s,#20
ror w19,w19,#24
ushr v17.4s,v28.4s,#20
ror w20,w20,#24
ushr v21.4s,v29.4s,#20
ror w21,w21,#24
sli v1.4s,v24.4s,#12
add w13,w13,w17
sli v5.4s,v25.4s,#12
add w14,w14,w19
sli v9.4s,v26.4s,#12
add w15,w15,w20
sli v13.4s,v27.4s,#12
add w16,w16,w21
sli v17.4s,v28.4s,#12
eor w9,w9,w13
sli v21.4s,v29.4s,#12
eor w10,w10,w14
add v0.4s,v0.4s,v1.4s
eor w11,w11,w15
add v4.4s,v4.4s,v5.4s
eor w12,w12,w16
add v8.4s,v8.4s,v9.4s
ror w9,w9,#25
add v12.4s,v12.4s,v13.4s
ror w10,w10,#25
add v16.4s,v16.4s,v17.4s
ror w11,w11,#25
add v20.4s,v20.4s,v21.4s
ror w12,w12,#25
eor v24.16b,v3.16b,v0.16b
add w5,w5,w10
eor v25.16b,v7.16b,v4.16b
add w6,w6,w11
eor v26.16b,v11.16b,v8.16b
add w7,w7,w12
eor v27.16b,v15.16b,v12.16b
add w8,w8,w9
eor v28.16b,v19.16b,v16.16b
eor w21,w21,w5
eor v29.16b,v23.16b,v20.16b
eor w17,w17,w6
ushr v3.4s,v24.4s,#24
eor w19,w19,w7
ushr v7.4s,v25.4s,#24
eor w20,w20,w8
ushr v11.4s,v26.4s,#24
ror w21,w21,#16
ushr v15.4s,v27.4s,#24
ror w17,w17,#16
ushr v19.4s,v28.4s,#24
ror w19,w19,#16
ushr v23.4s,v29.4s,#24
ror w20,w20,#16
sli v3.4s,v24.4s,#8
add w15,w15,w21
sli v7.4s,v25.4s,#8
add w16,w16,w17
sli v11.4s,v26.4s,#8
add w13,w13,w19
sli v15.4s,v27.4s,#8
add w14,w14,w20
sli v19.4s,v28.4s,#8
eor w10,w10,w15
sli v23.4s,v29.4s,#8
eor w11,w11,w16
add v2.4s,v2.4s,v3.4s
eor w12,w12,w13
add v6.4s,v6.4s,v7.4s
eor w9,w9,w14
add v10.4s,v10.4s,v11.4s
ror w10,w10,#20
add v14.4s,v14.4s,v15.4s
ror w11,w11,#20
add v18.4s,v18.4s,v19.4s
ror w12,w12,#20
add v22.4s,v22.4s,v23.4s
ror w9,w9,#20
eor v24.16b,v1.16b,v2.16b
add w5,w5,w10
eor v25.16b,v5.16b,v6.16b
add w6,w6,w11
eor v26.16b,v9.16b,v10.16b
add w7,w7,w12
eor v27.16b,v13.16b,v14.16b
add w8,w8,w9
eor v28.16b,v17.16b,v18.16b
eor w21,w21,w5
eor v29.16b,v21.16b,v22.16b
eor w17,w17,w6
ushr v1.4s,v24.4s,#25
eor w19,w19,w7
ushr v5.4s,v25.4s,#25
eor w20,w20,w8
ushr v9.4s,v26.4s,#25
ror w21,w21,#24
ushr v13.4s,v27.4s,#25
ror w17,w17,#24
ushr v17.4s,v28.4s,#25
ror w19,w19,#24
ushr v21.4s,v29.4s,#25
ror w20,w20,#24
sli v1.4s,v24.4s,#7
add w15,w15,w21
sli v5.4s,v25.4s,#7
add w16,w16,w17
sli v9.4s,v26.4s,#7
add w13,w13,w19
sli v13.4s,v27.4s,#7
add w14,w14,w20
sli v17.4s,v28.4s,#7
eor w10,w10,w15
sli v21.4s,v29.4s,#7
eor w11,w11,w16
ext v2.16b,v2.16b,v2.16b,#8
eor w12,w12,w13
ext v6.16b,v6.16b,v6.16b,#8
eor w9,w9,w14
ext v10.16b,v10.16b,v10.16b,#8
ror w10,w10,#25
ext v14.16b,v14.16b,v14.16b,#8
ror w11,w11,#25
ext v18.16b,v18.16b,v18.16b,#8
ror w12,w12,#25
ext v22.16b,v22.16b,v22.16b,#8
ror w9,w9,#25
ext v3.16b,v3.16b,v3.16b,#4
ext v7.16b,v7.16b,v7.16b,#4
ext v11.16b,v11.16b,v11.16b,#4
ext v15.16b,v15.16b,v15.16b,#4
ext v19.16b,v19.16b,v19.16b,#4
ext v23.16b,v23.16b,v23.16b,#4
ext v1.16b,v1.16b,v1.16b,#12
ext v5.16b,v5.16b,v5.16b,#12
ext v9.16b,v9.16b,v9.16b,#12
ext v13.16b,v13.16b,v13.16b,#12
ext v17.16b,v17.16b,v17.16b,#12
ext v21.16b,v21.16b,v21.16b,#12
cbnz x4,Loop_lower_neon
add w5,w5,w22 // accumulate key block
ldp q24,q25,[sp,#0]
add x6,x6,x22,lsr#32
ldp q26,q27,[sp,#32]
add w7,w7,w23
ldp q28,q29,[sp,#64]
add x8,x8,x23,lsr#32
add v0.4s,v0.4s,v24.4s
add w9,w9,w24
add v4.4s,v4.4s,v24.4s
add x10,x10,x24,lsr#32
add v8.4s,v8.4s,v24.4s
add w11,w11,w25
add v12.4s,v12.4s,v24.4s
add x12,x12,x25,lsr#32
add v16.4s,v16.4s,v24.4s
add w13,w13,w26
add v20.4s,v20.4s,v24.4s
add x14,x14,x26,lsr#32
add v2.4s,v2.4s,v26.4s
add w15,w15,w27
add v6.4s,v6.4s,v26.4s
add x16,x16,x27,lsr#32
add v10.4s,v10.4s,v26.4s
add w17,w17,w28
add v14.4s,v14.4s,v26.4s
add x19,x19,x28,lsr#32
add v18.4s,v18.4s,v26.4s
add w20,w20,w30
add v22.4s,v22.4s,v26.4s
add x21,x21,x30,lsr#32
add v19.4s,v19.4s,v31.4s // +4
add x5,x5,x6,lsl#32 // pack
add v23.4s,v23.4s,v31.4s // +4
add x7,x7,x8,lsl#32
add v3.4s,v3.4s,v27.4s
ldp x6,x8,[x1,#0] // load input
add v7.4s,v7.4s,v28.4s
add x9,x9,x10,lsl#32
add v11.4s,v11.4s,v29.4s
add x11,x11,x12,lsl#32
add v15.4s,v15.4s,v30.4s
ldp x10,x12,[x1,#16]
add v19.4s,v19.4s,v27.4s
add x13,x13,x14,lsl#32
add v23.4s,v23.4s,v28.4s
add x15,x15,x16,lsl#32
add v1.4s,v1.4s,v25.4s
ldp x14,x16,[x1,#32]
add v5.4s,v5.4s,v25.4s
add x17,x17,x19,lsl#32
add v9.4s,v9.4s,v25.4s
add x20,x20,x21,lsl#32
add v13.4s,v13.4s,v25.4s
ldp x19,x21,[x1,#48]
add v17.4s,v17.4s,v25.4s
add x1,x1,#64
add v21.4s,v21.4s,v25.4s
#ifdef __ARMEB__
rev x5,x5
rev x7,x7
rev x9,x9
rev x11,x11
rev x13,x13
rev x15,x15
rev x17,x17
rev x20,x20
#endif
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
eor x5,x5,x6
eor x7,x7,x8
eor x9,x9,x10
eor x11,x11,x12
eor x13,x13,x14
eor v0.16b,v0.16b,v24.16b
eor x15,x15,x16
eor v1.16b,v1.16b,v25.16b
eor x17,x17,x19
eor v2.16b,v2.16b,v26.16b
eor x20,x20,x21
eor v3.16b,v3.16b,v27.16b
ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64
stp x5,x7,[x0,#0] // store output
add x28,x28,#7 // increment counter
stp x9,x11,[x0,#16]
stp x13,x15,[x0,#32]
stp x17,x20,[x0,#48]
add x0,x0,#64
st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64
eor v4.16b,v4.16b,v24.16b
eor v5.16b,v5.16b,v25.16b
eor v6.16b,v6.16b,v26.16b
eor v7.16b,v7.16b,v27.16b
st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
eor v8.16b,v8.16b,v0.16b
ldp q24,q25,[sp,#0]
eor v9.16b,v9.16b,v1.16b
ldp q26,q27,[sp,#32]
eor v10.16b,v10.16b,v2.16b
eor v11.16b,v11.16b,v3.16b
st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64
ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64
eor v12.16b,v12.16b,v4.16b
eor v13.16b,v13.16b,v5.16b
eor v14.16b,v14.16b,v6.16b
eor v15.16b,v15.16b,v7.16b
st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64
ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64
eor v16.16b,v16.16b,v8.16b
eor v17.16b,v17.16b,v9.16b
eor v18.16b,v18.16b,v10.16b
eor v19.16b,v19.16b,v11.16b
st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
shl v0.4s,v31.4s,#1 // 4 -> 8
eor v20.16b,v20.16b,v12.16b
eor v21.16b,v21.16b,v13.16b
eor v22.16b,v22.16b,v14.16b
eor v23.16b,v23.16b,v15.16b
st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
add v27.4s,v27.4s,v0.4s // += 8
add v28.4s,v28.4s,v0.4s
add v29.4s,v29.4s,v0.4s
add v30.4s,v30.4s,v0.4s
b.hs Loop_outer_512_neon
adds x2,x2,#512
ushr v0.4s,v31.4s,#2 // 4 -> 1
ldp d8,d9,[sp,#128+0] // meet ABI requirements
ldp d10,d11,[sp,#128+16]
ldp d12,d13,[sp,#128+32]
ldp d14,d15,[sp,#128+48]
stp q24,q31,[sp,#0] // wipe off-load area
stp q24,q31,[sp,#32]
stp q24,q31,[sp,#64]
b.eq Ldone_512_neon
cmp x2,#192
sub v27.4s,v27.4s,v0.4s // -= 1
sub v28.4s,v28.4s,v0.4s
sub v29.4s,v29.4s,v0.4s
add sp,sp,#128
b.hs Loop_outer_neon
eor v25.16b,v25.16b,v25.16b
eor v26.16b,v26.16b,v26.16b
eor v27.16b,v27.16b,v27.16b
eor v28.16b,v28.16b,v28.16b
eor v29.16b,v29.16b,v29.16b
eor v30.16b,v30.16b,v30.16b
b Loop_outer
Ldone_512_neon:
ldp x19,x20,[x29,#16]
add sp,sp,#128+64
ldp x21,x22,[x29,#32]
ldp x23,x24,[x29,#48]
ldp x25,x26,[x29,#64]
ldp x27,x28,[x29,#80]
ldp x29,x30,[sp],#96
ret