From ac9fdb056a3da8d2f467c2dcedcd2b4d44f3fd1c Mon Sep 17 00:00:00 2001
From: John Baldwin
Date: Tue, 21 May 2024 09:42:12 -0700
Subject: [PATCH 01/19] cryptocheck: Don't test Chacha20-Poly1305 with an IV
 size of 8

OpenSSL 3.0+ doesn't support an IV size of 8 for either the Chacha20
stream cipher or the AEAD combination with Poly1305. This did work
previously with OpenSSL 1.1.

Reviewed by:    markj
Sponsored by:   AFRL, DARPA
Differential Revision:  https://reviews.freebsd.org/D45280

(cherry picked from commit 4ebf794a08de04ebf1c520f07bff3f8fdf48819a)
---
 tools/tools/crypto/cryptocheck.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/tools/crypto/cryptocheck.c b/tools/tools/crypto/cryptocheck.c
index 2db283e4f65b..ef3e225e94f6 100644
--- a/tools/tools/crypto/cryptocheck.c
+++ b/tools/tools/crypto/cryptocheck.c
@@ -136,7 +136,7 @@
  * aes-ccm128          128-bit AES-CCM
  * aes-ccm192          192-bit AES-CCM
  * aes-ccm256          256-bit AES-CCM
- * chacha20-poly1305   Chacha20 with Poly1305 per RFC 8439
+ * chacha20-poly1305   Chacha20 (96 bit nonce) with Poly1305 per RFC 8439
  */

 #include

@@ -253,7 +253,7 @@ static const struct alg {
           .evp_cipher = EVP_aes_256_ccm },
         { .name = "chacha20-poly1305", .cipher = CRYPTO_CHACHA20_POLY1305,
           .type = T_AEAD, .tag_len = POLY1305_HASH_LEN,
-          .iv_sizes = { CHACHA20_POLY1305_IV_LEN, 8 },
+          .iv_sizes = { CHACHA20_POLY1305_IV_LEN },
           .evp_cipher = EVP_chacha20_poly1305 },
 };

From ca31dfa6e0014dc4075b5c7128bbdab53254ae62 Mon Sep 17 00:00:00 2001
From: John Baldwin
Date: Thu, 9 May 2024 11:49:57 -0700
Subject: [PATCH 02/19] OpenSSL arm_arch.h: Add helper macros for purecap
 support

- PTR(n) can be used to select either x0-x30 or c0-c30.
- PTRN(n) can be used to select either sp or csp.
- PTR_WIDTH is the size of a pointer in bytes.
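
As a quick illustration (a hypothetical fragment written in the style of the
later patches in this series, not code taken from this commit), a frame
save/restore sequence using these macros looks like:

    stp  PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]!  // push frame pointer and link register
    ...
    ldp  PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH)    // pop them again

Under __CHERI_PURE_CAPABILITY__ (PTR_WIDTH is 16) this becomes the capability
form, using capability registers and the capability stack pointer:

    stp  c29,c30,[csp,#-32]!
    ldp  c29,c30,[csp],#32

while the plain AArch64 build (PTR_WIDTH is 8) keeps the existing form:

    stp  x29,x30,[sp,#-16]!
    ldp  x29,x30,[sp],#16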
--- crypto/openssl/crypto/arm_arch.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/crypto/openssl/crypto/arm_arch.h b/crypto/openssl/crypto/arm_arch.h index 7bedb385d971..d8c9ae6d7796 100644 --- a/crypto/openssl/crypto/arm_arch.h +++ b/crypto/openssl/crypto/arm_arch.h @@ -177,6 +177,22 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; .popsection; # endif + /* + * Support macros for Morello + */ + +# if __ARM_ARCH__>=8 +# ifdef __CHERI_PURE_CAPABILITY__ +# define PTR_WIDTH 16 +# define PTR(n) c ## n +# define PTRN(n) c ## n +# else +# define PTR_WIDTH 8 +# define PTR(n) x ## n +# define PTRN(n) n +# endif +# endif + # endif /* defined __ASSEMBLER__ */ #endif From e41fa987d5238ccd0947c7e3f22be27e522c2dde Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 1 May 2024 10:17:24 -0700 Subject: [PATCH 03/19] OpenSSL arm64cpuid.pl: Add purecap support --- crypto/openssl/crypto/arm64cpuid.pl | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/crypto/openssl/crypto/arm64cpuid.pl b/crypto/openssl/crypto/arm64cpuid.pl index 11f0e5027942..4a70430739d4 100755 --- a/crypto/openssl/crypto/arm64cpuid.pl +++ b/crypto/openssl/crypto/arm64cpuid.pl @@ -25,7 +25,9 @@ #include "arm_arch.h" .text +#ifndef __CHERI_PURE_CAPABILITY__ .arch armv8-a+crypto +#endif .align 5 .globl _armv7_neon_probe @@ -106,7 +108,7 @@ b.hi .Lot // len>15 nop .Little: - strb wzr,[x0],#1 // store byte-by-byte + strb wzr,[PTR(0)],#1 // store byte-by-byte subs x1,x1,#1 b.ne .Little .Lret: ret @@ -114,13 +116,13 @@ .align 4 .Lot: tst x0,#7 b.eq .Laligned // inp is aligned - strb wzr,[x0],#1 // store byte-by-byte + strb wzr,[PTR(0)],#1 // store byte-by-byte sub x1,x1,#1 b .Lot .align 4 .Laligned: - str xzr,[x0],#8 // store word-by-word + str xzr,[PTR(0)],#8 // store word-by-word sub x1,x1,#8 tst x1,#-8 b.ne .Laligned // len>=8 @@ -137,8 +139,8 @@ cbz x2,.Lno_data // len==0? cmp x2,#16 b.ne .Loop_cmp - ldp x8,x9,[x0] - ldp x10,x11,[x1] + ldp x8,x9,[PTR(0)] + ldp x10,x11,[PTR(1)] eor x8,x8,x10 eor x9,x9,x11 orr x8,x8,x9 @@ -149,8 +151,8 @@ .align 4 .Loop_cmp: - ldrb w4,[x0],#1 - ldrb w5,[x1],#1 + ldrb w4,[PTR(0)],#1 + ldrb w5,[PTR(1)],#1 eor w4,w4,w5 orr w3,w3,w4 subs x2,x2,#1 From 91da534850f451acf776b769db5bbe13f6e9f718 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 1 May 2024 10:32:16 -0700 Subject: [PATCH 04/19] OpenSSL aesv8-armx.pl: Add purecap support --- crypto/openssl/crypto/aes/asm/aesv8-armx.pl | 125 +++++++++++--------- 1 file changed, 69 insertions(+), 56 deletions(-) diff --git a/crypto/openssl/crypto/aes/asm/aesv8-armx.pl b/crypto/openssl/crypto/aes/asm/aesv8-armx.pl index 30dad3d03456..f8ddcb723cf2 100755 --- a/crypto/openssl/crypto/aes/asm/aesv8-armx.pl +++ b/crypto/openssl/crypto/aes/asm/aesv8-armx.pl @@ -79,7 +79,12 @@ #if __ARM_MAX_ARCH__>=7 ___ -$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); +$code.=<<___ if ($flavour =~ /64/); +#ifndef __CHERI_PURE_CAPABILITY__ +.arch armv8-a+crypto +#endif +.text +___ $code.=<<___ if ($flavour !~ /64/); .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) .fpu neon @@ -101,17 +106,20 @@ # transliterate common code to either flavour with regex vodoo. # {{{ -my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12"); +my ($inp,$bits,$out,$ptr,$rval,$rounds)=("PTR(0)","w1","PTR(2)","PTR(3)","x3","w12"); +my ($inpx,$outx)=("x0","x2"); my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)= $flavour=~/64/? 
map("q$_",(0..6)) : map("q$_",(0..3,8..10)); $code.=<<___; .align 5 +.type .Lrcon,%object .Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b +.size .Lrcon, . - .Lrcon .globl ${prefix}_set_encrypt_key .type ${prefix}_set_encrypt_key,%function @@ -122,16 +130,16 @@ $code.=<<___ if ($flavour =~ /64/); AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ___ $code.=<<___; - mov $ptr,#-1 - cmp $inp,#0 + mov $rval,#-1 + cmp $inpx,#0 b.eq .Lenc_key_abort - cmp $out,#0 + cmp $outx,#0 b.eq .Lenc_key_abort - mov $ptr,#-2 + mov $rval,#-2 cmp $bits,#128 b.lt .Lenc_key_abort cmp $bits,#256 @@ -283,11 +291,11 @@ .Ldone: str $rounds,[$out] - mov $ptr,#0 + mov $rval,#0 .Lenc_key_abort: - mov x0,$ptr // return value - `"ldr x29,[sp],#16" if ($flavour =~ /64/)` + mov x0,$rval // return value + `"ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH)" if ($flavour =~ /64/)` ret .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key @@ -298,8 +306,8 @@ ___ $code.=<<___ if ($flavour =~ /64/); AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ___ $code.=<<___ if ($flavour !~ /64/); stmdb sp!,{r4,lr} @@ -340,7 +348,7 @@ ldmia sp!,{r4,pc} ___ $code.=<<___ if ($flavour =~ /64/); - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret ___ @@ -352,7 +360,7 @@ sub gen_block () { my $dir = shift; my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc"); -my ($inp,$out,$key)=map("x$_",(0..2)); +my ($inp,$out,$key)=map("PTR($_)",(0..2)); my $rounds="w3"; my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3)); @@ -422,8 +430,8 @@ () # store, just looks like what the original ECB implementation does. {{{ -my ($inp,$out,$len,$key)=map("x$_",(0..3)); -my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); +my ($inp,$out,$len,$key)=("PTR(0)","PTR(1)","x2","PTR(3)"); +my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","PTR(7)","x8"); my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); @@ -541,8 +549,8 @@ () .Lecb_big_size: ___ $code.=<<___ if ($flavour =~ /64/); - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 ___ $code.=<<___ if ($flavour !~ /64/); mov ip,sp @@ -1217,7 +1225,7 @@ () ldmia sp!,{r4-r8,pc} ___ $code.=<<___ if ($flavour =~ /64/); - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ___ $code.=<<___ if ($flavour =~ /64/); .Lecb_Final_abort: @@ -1228,12 +1236,13 @@ () ___ }}} {{{ -my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; -my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); +my ($inp,$out,$len,$key,$ivp)=("PTR(0)","PTR(1)","x2","PTR(3)","PTR(4)"); +my $enc="w5"; +my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","PTR(7)","x8","x12"); my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); -my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key); +my ($key4,$key5,$key6,$key7)=("PTR(6)","PTR(12)","PTR(14)",$key); ### q8-q15 preloaded key schedule @@ -1246,8 +1255,8 @@ () $code.=<<___ if ($flavour =~ /64/); AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ___ $code.=<<___ if ($flavour !~ /64/); mov ip,sp @@ -1744,7 +1753,7 @@ () ldmia sp!,{r4-r8,pc} ___ $code.=<<___ if ($flavour =~ /64/); - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret ___ $code.=<<___; @@ -1752,8 +1761,8 @@ () ___ }}} {{{ -my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); -my ($rounds,$cnt,$key_)=("w5","w6","x7"); +my ($inp,$out,$len,$key,$ivp)=("PTR(0)","PTR(1)","x2","PTR(3)","PTR(4)"); +my ($rounds,$cnt,$key_)=("w5","w6","PTR(7)"); my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); my $step="x12"; # aliases with $tctr2 @@ -1776,8 +1785,8 @@ () $code.=<<___ if ($flavour =~ /64/); AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ___ $code.=<<___ if ($flavour !~ /64/); mov ip,sp @@ -2196,7 +2205,7 @@ () ldmia sp!,{r4-r10,pc} ___ $code.=<<___ if ($flavour =~ /64/); - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret ___ $code.=<<___; @@ -2234,10 +2243,12 @@ () # plain text block. {{{ -my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); -my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); -my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); +my ($inp,$out,$len,$key1,$key2,$ivp)=("PTR(0)","PTR(1)","x2","PTR(3)","PTR(4)","PTR(5)"); +my ($ivpx)=("x5"); +my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","PTR(7)","x8","x9","x10"); +my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("PTR(13)","w14","w15","PTR(20)"); my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); +my ($tailcntp,$midnump,$constnump)=("PTR(21)","PTR(22)","PTR(19)"); my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); @@ -2346,10 +2357,10 @@ () .Lxts_enc_big_size: ___ $code.=<<___ if ($flavour =~ /64/); - stp $constnumx,$tmpinp,[sp,#-64]! - stp $tailcnt,$midnumx,[sp,#48] - stp $ivd10,$ivd20,[sp,#32] - stp $ivd30,$ivd40,[sp,#16] + stp $constnump,$tmpinp,[PTRN(sp),#-(4*PTR_WIDTH+32)]! 
+ stp $tailcntp,$midnump,[PTRN(sp),#(2*PTR_WIDTH)] + stp $ivd10,$ivd20,[PTRN(sp),#(4*PTR_WIDTH)] + stp $ivd30,$ivd40,[PTRN(sp),#(4*PTR_WIDTH+16)] // tailcnt store the tail value of length%16. and $tailcnt,$len,#0xf @@ -2400,7 +2411,7 @@ () vld1.32 {q8-q9},[$key1] // load key schedule... sub $rounds0,$rounds0,#6 - add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys + add $key_,$key1,$ivpx,lsl#4 // pointer to last 7 round keys sub $rounds0,$rounds0,#2 vld1.32 {q10-q11},[$key_],#32 vld1.32 {q12-q13},[$key_],#32 @@ -2901,10 +2912,10 @@ () vst1.8 {$tmpin},[$out] .Lxts_abort: - ldp $tailcnt,$midnumx,[sp,#48] - ldp $ivd10,$ivd20,[sp,#32] - ldp $ivd30,$ivd40,[sp,#16] - ldp $constnumx,$tmpinp,[sp],#64 + ldp $tailcntp,$midnump,[PTRN(sp),#(2*PTR_WIDTH)] + ldp $ivd10,$ivd20,[PTRN(sp),#(4*PTR_WIDTH)] + ldp $ivd30,$ivd40,[PTRN(sp),#(4*PTR_WIDTH+16)] + ldp $constnump,$tmpinp,[PTRN(sp)],#(4*PTR_WIDTH+32) .Lxts_enc_final_abort: ret .size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt @@ -2912,10 +2923,12 @@ () }}} {{{ -my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); -my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); -my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); +my ($inp,$out,$len,$key1,$key2,$ivp)=("PTR(0)","PTR(1)","x2","PTR(3)","PTR(4)","PTR(5)"); +my ($ivpx)=("x5"); +my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","PTR(7)","x8","x9","x10"); +my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("PTR(13)","w14","w15","PTR(20)"); my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); +my ($tailcntp,$midnump,$constnump)=("PTR(21)","PTR(22)","PTR(19)"); my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); @@ -3021,10 +3034,10 @@ () .Lxts_dec_big_size: ___ $code.=<<___ if ($flavour =~ /64/); - stp $constnumx,$tmpinp,[sp,#-64]! - stp $tailcnt,$midnumx,[sp,#48] - stp $ivd10,$ivd20,[sp,#32] - stp $ivd30,$ivd40,[sp,#16] + stp $constnump,$tmpinp,[PTRN(sp),#-(4*PTR_WIDTH+32)]! + stp $tailcntp,$midnump,[PTRN(sp),#(2*PTR_WIDTH)] + stp $ivd10,$ivd20,[PTRN(sp),#(4*PTR_WIDTH)] + stp $ivd30,$ivd40,[PTRN(sp),#(4*PTR_WIDTH+16)] and $tailcnt,$len,#0xf and $len,$len,#-16 @@ -3080,7 +3093,7 @@ () vld1.32 {q8-q9},[$key1] // load key schedule... sub $rounds0,$rounds0,#6 - add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys + add $key_,$key1,$ivpx,lsl#4 // pointer to last 7 round keys sub $rounds0,$rounds0,#2 vld1.32 {q10-q11},[$key_],#32 // load key schedule... vld1.32 {q12-q13},[$key_],#32 @@ -3553,7 +3566,7 @@ () tst $tailcnt,#0xf b.eq .Lxts_dec_abort // Processing the last two blocks with cipher stealing. 
- mov x7,x3 + mov $key_,$key1 cbnz x2,.Lxts_dec_1st_done vld1.8 {$dat0},[$inp],#16 @@ -3622,10 +3635,10 @@ () vst1.8 {$tmpin},[$out] .Lxts_dec_abort: - ldp $tailcnt,$midnumx,[sp,#48] - ldp $ivd10,$ivd20,[sp,#32] - ldp $ivd30,$ivd40,[sp,#16] - ldp $constnumx,$tmpinp,[sp],#64 + ldp $tailcntp,$midnump,[PTRN(sp),#(2*PTR_WIDTH)] + ldp $ivd10,$ivd20,[PTRN(sp),#(4*PTR_WIDTH)] + ldp $ivd30,$ivd40,[PTRN(sp),#(4*PTR_WIDTH+16)] + ldp $constnump,$tmpinp,[PTRN(sp)],#(4*PTR_WIDTH+32) .Lxts_dec_final_abort: ret From cca401619a516ebf2f63788ea1909b4af34d2f0a Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 2 May 2024 10:34:52 -0700 Subject: [PATCH 05/19] OpenSSL vpaes-armv8.pl: Add purecap support --- crypto/openssl/crypto/aes/asm/vpaes-armv8.pl | 283 ++++++++++--------- 1 file changed, 148 insertions(+), 135 deletions(-) diff --git a/crypto/openssl/crypto/aes/asm/vpaes-armv8.pl b/crypto/openssl/crypto/aes/asm/vpaes-armv8.pl index 49988e9c2b29..a9a66357253c 100755 --- a/crypto/openssl/crypto/aes/asm/vpaes-armv8.pl +++ b/crypto/openssl/crypto/aes/asm/vpaes-armv8.pl @@ -149,7 +149,7 @@ ___ { -my ($inp,$out,$key) = map("x$_",(0..2)); +my ($inp,$out,$key) = map("PTR($_)",(0..2)); my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23)); my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27)); @@ -165,11 +165,11 @@ .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: - adr x10, .Lk_inv + adr PTR(10), .Lk_inv movi v17.16b, #0x0f - ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv - ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo - ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ld1 {v18.2d-v19.2d}, [PTR(10)],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [PTR(10)],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d-v27.2d}, [PTR(10)] // .Lk_sb1, .Lk_sb2 ret .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat @@ -191,11 +191,11 @@ .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: - mov x9, $key + mov PTR(9), $key ldr w8, [$key,#240] // pull rounds - adr x11, .Lk_mc_forward+16 + adr PTR(11), .Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + ld1 {v16.2d}, [PTR(9)], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 @@ -208,22 +208,26 @@ .align 4 .Lenc_loop: // middle of middle round - add x10, x11, #0x40 + add PTR(10), PTR(11), #0x40 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + ld1 {v1.2d}, [PTR(11)], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t - ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + ld1 {v4.2d}, [PTR(10)] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, 
%xmm3, %xmm4 # 0 = 2B+C eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D +#ifdef __CHERI_PURE_CAPABILITY__ + alignd c11, c11, #6 // and \$0x30, %r11 # ... mod 4 +#else and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4 +#endif eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D sub w8, w8, #1 // nr-- @@ -241,15 +245,15 @@ tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_loop // middle of last round - add x10, x11, #0x80 + add PTR(10), PTR(11), #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + ld1 {v1.2d}, [PTR(10)] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A @@ -262,15 +266,15 @@ .align 4 vpaes_encrypt: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ld1 {v7.16b}, [$inp] bl _vpaes_encrypt_preheat bl _vpaes_encrypt_core st1 {v0.16b}, [$out] - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_encrypt,.-vpaes_encrypt @@ -278,11 +282,11 @@ .type _vpaes_encrypt_2x,%function .align 4 _vpaes_encrypt_2x: - mov x9, $key + mov PTR(9), $key ldr w8, [$key,#240] // pull rounds - adr x11, .Lk_mc_forward+16 + adr PTR(11), .Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + ld1 {v16.2d}, [PTR(9)], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b @@ -301,10 +305,10 @@ .align 4 .Lenc_2x_loop: // middle of middle round - add x10, x11, #0x40 + add PTR(10), PTR(11), #0x40 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u tbl v12.16b, {$sb1t}, v10.16b - ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + ld1 {v1.2d}, [PTR(11)], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t tbl v8.16b, {$sb1u}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k @@ -315,7 +319,7 @@ eor v8.16b, v8.16b, v12.16b tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t tbl v10.16b, {$sb2u}, v11.16b - ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + ld1 {v4.2d}, [PTR(10)] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B tbl v11.16b, {v8.16b}, v1.16b eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A @@ -328,7 +332,11 @@ tbl v12.16b, {v11.16b},v1.16b eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D eor v8.16b, v8.16b, v11.16b +#ifdef __CHERI_PURE_CAPABILITY__ + alignd c11, c11, #6 // and \$0x30, %r11 # ... mod 4 +#else and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... 
mod 4 +#endif eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D eor v8.16b, v8.16b, v12.16b sub w8, w8, #1 // nr-- @@ -359,16 +367,16 @@ eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_2x_loop // middle of last round - add x10, x11, #0x80 + add PTR(10), PTR(11), #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou tbl v12.16b, {$sbou}, v10.16b - ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + ld1 {v1.2d}, [PTR(10)] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t tbl v8.16b, {$sbot}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k @@ -383,13 +391,13 @@ .type _vpaes_decrypt_preheat,%function .align 4 _vpaes_decrypt_preheat: - adr x10, .Lk_inv + adr PTR(10), .Lk_inv movi v17.16b, #0x0f - adr x11, .Lk_dipt - ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv - ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo - ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd - ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + adr PTR(11), .Lk_dipt + ld1 {v18.2d-v19.2d}, [PTR(10)],#32 // .Lk_inv + ld1 {v20.2d-v23.2d}, [PTR(11)],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d-v27.2d}, [PTR(11)],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d-v31.2d}, [PTR(11)] // .Lk_dsbb, .Lk_dsbe ret .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat @@ -401,22 +409,22 @@ .type _vpaes_decrypt_core,%function .align 4 _vpaes_decrypt_core: - mov x9, $key + mov PTR(9), $key ldr w8, [$key,#240] // pull rounds // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 eor x11, x11, #0x30 // xor \$0x30, %r11 - adr x10, .Lk_sr + adr PTR(10), .Lk_sr and x11, x11, #0x30 // and \$0x30, %r11 - add x11, x11, x10 - adr x10, .Lk_mc_forward+48 + add PTR(11), PTR(10), x11 + adr PTR(10), .Lk_mc_forward+48 - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm4 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 - ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + ld1 {v5.2d}, [PTR(10)] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 @@ -475,14 +483,14 @@ tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm0 cbnz w8, .Ldec_loop // middle of last round // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + ld1 {v2.2d}, [PTR(11)] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = 
sb1u + k eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A @@ -495,15 +503,15 @@ .align 4 vpaes_decrypt: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ld1 {v7.16b}, [$inp] bl _vpaes_decrypt_preheat bl _vpaes_decrypt_core st1 {v0.16b}, [$out] - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_decrypt,.-vpaes_decrypt @@ -512,25 +520,25 @@ .type _vpaes_decrypt_2x,%function .align 4 _vpaes_decrypt_2x: - mov x9, $key + mov PTR(9), $key ldr w8, [$key,#240] // pull rounds // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11 eor x11, x11, #0x30 // xor \$0x30, %r11 - adr x10, .Lk_sr + adr PTR(10), .Lk_sr and x11, x11, #0x30 // and \$0x30, %r11 - add x11, x11, x10 - adr x10, .Lk_mc_forward+48 + add PTR(11), PTR(10), x11 + adr PTR(10), .Lk_mc_forward+48 - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm4 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b ushr v8.16b, v15.16b, #4 tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 tbl v10.16b, {$iptlo},v9.16b - ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + ld1 {v5.2d}, [PTR(10)] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 tbl v8.16b, {$ipthi},v8.16b @@ -623,7 +631,7 @@ eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm0 cbnz w8, .Ldec_2x_loop // middle of last round @@ -633,7 +641,7 @@ // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t tbl v9.16b, {$sbot}, v11.16b - ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + ld1 {v2.2d}, [PTR(11)] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A @@ -645,7 +653,8 @@ ___ } { -my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3"); +my ($inp,$bits,$out,$dir)=("PTR(0)","w1","PTR(2)","w3"); +my ($inpx)=("x0"); my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8)); $code.=<<___; @@ -657,18 +666,18 @@ .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: - adr x10, .Lk_inv + adr PTR(10), .Lk_inv movi v16.16b, #0x5b // .Lk_s63 - adr x11, .Lk_sb1 + adr PTR(11), .Lk_sb1 movi v17.16b, #0x0f // .Lk_s0F - ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt - adr x10, .Lk_dksd - ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1 - adr x11, .Lk_mc_forward - ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb - ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 - ld1 {v8.2d}, [x10] // .Lk_rcon - ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ld1 {v18.2d-v21.2d}, [PTR(10)] // .Lk_inv, .Lk_ipt + adr PTR(10), .Lk_dksd + ld1 {v22.2d-v23.2d}, [PTR(11)] // .Lk_sb1 + adr PTR(11), .Lk_mc_forward + ld1 {v24.2d-v27.2d}, [PTR(10)],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d-v31.2d}, [PTR(10)],#64 // .Lk_dkse, .Lk_dks9 + ld1 {v8.2d}, [PTR(10)] // .Lk_rcon + ld1 {v9.2d}, [PTR(11)] // .Lk_mc_forward[0] ret .size 
_vpaes_key_preheat,.-_vpaes_key_preheat @@ -676,8 +685,8 @@ .align 4 _vpaes_schedule_core: AARCH64_SIGN_LINK_REGISTER - stp x29, x30, [sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 bl _vpaes_key_preheat // load the tables @@ -688,8 +697,8 @@ bl _vpaes_schedule_transform mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 - adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10 - add x8, x8, x10 + adr PTR(10), .Lk_sr // lea .Lk_sr(%rip),%r10 + add PTR(8), PTR(10), x8 cbnz $dir, .Lschedule_am_decrypting // encrypting, output zeroth round key after transform @@ -698,7 +707,7 @@ .Lschedule_am_decrypting: // decrypting, output zeroth round key after shiftrows - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + ld1 {v1.2d}, [PTR(8)] // vmovdqa (%r8,%r10), %xmm1 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) eor x8, x8, #0x30 // xor \$0x30, %r8 @@ -718,12 +727,12 @@ // are accomplished by the subroutines. // .Lschedule_128: - mov $inp, #10 // mov \$10, %esi + mov $inpx, #10 // mov \$10, %esi .Loop_schedule_128: - sub $inp, $inp, #1 // dec %esi + sub $inpx, $inpx, #1 // dec %esi bl _vpaes_schedule_round - cbz $inp, .Lschedule_mangle_last + cbz $inpx, .Lschedule_mangle_last bl _vpaes_schedule_mangle // write output b .Loop_schedule_128 @@ -750,17 +759,17 @@ mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros - mov $inp, #4 // mov \$4, %esi + mov $inpx, #4 // mov \$4, %esi .Loop_schedule_192: - sub $inp, $inp, #1 // dec %esi + sub $inpx, $inpx, #1 // dec %esi bl _vpaes_schedule_round ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0 bl _vpaes_schedule_mangle // save key n bl _vpaes_schedule_192_smear bl _vpaes_schedule_mangle // save key n+1 bl _vpaes_schedule_round - cbz $inp, .Lschedule_mangle_last + cbz $inpx, .Lschedule_mangle_last bl _vpaes_schedule_mangle // save key n+2 bl _vpaes_schedule_192_smear b .Loop_schedule_192 @@ -779,16 +788,16 @@ .Lschedule_256: ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) bl _vpaes_schedule_transform // input transform - mov $inp, #7 // mov \$7, %esi + mov $inpx, #7 // mov \$7, %esi .Loop_schedule_256: - sub $inp, $inp, #1 // dec %esi + sub $inpx, $inpx, #1 // dec %esi bl _vpaes_schedule_mangle // output low result mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 // high round bl _vpaes_schedule_round - cbz $inp, .Lschedule_mangle_last + cbz $inpx, .Lschedule_mangle_last bl _vpaes_schedule_mangle // low round. 
swap xmm7 and xmm6 @@ -814,17 +823,17 @@ .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 - adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + adr PTR(11), .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew cbnz $dir, .Lschedule_mangle_last_dec // encrypting - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 - adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + ld1 {v1.2d}, [PTR(8)] // vmovdqa (%r8,%r10),%xmm1 + adr PTR(11), .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform add $out, $out, #32 // add \$32, %rdx tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute .Lschedule_mangle_last_dec: - ld1 {v20.2d-v21.2d}, [x11] // reload constants + ld1 {v20.2d-v21.2d}, [PTR(11)] // reload constants sub $out, $out, #16 // add \$-16, %rdx eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform // output transform @@ -839,7 +848,7 @@ eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 - ldp x29, x30, [sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ -997,7 +1006,7 @@ tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + ld1 {v1.2d}, [PTR(8)] // vmovdqa (%r8,%r10), %xmm1 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 b .Lschedule_mangle_both @@ -1035,7 +1044,7 @@ tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 // vmovdqa 0x70(%r11), %xmm4 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + ld1 {v1.2d}, [PTR(8)] // vmovdqa (%r8,%r10), %xmm1 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 @@ -1043,8 +1052,12 @@ .Lschedule_mangle_both: tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - add x8, x8, #64-16 // add \$-16, %r8 + add PTR(8), PTR(8), #64-16 // add \$-16, %r8 +#ifdef __CHERI_PURE_CAPABILITY__ + alignd c8, c8, #6 // and \$0x30, %r8 +#else and x8, x8, #~(1<<6) // and \$0x30, %r8 +#endif st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx) ret .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle @@ -1054,9 +1067,9 @@ .align 4 vpaes_set_encrypt_key: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! // ABI spec says so lsr w9, $bits, #5 // shr \$5,%eax add w9, w9, #5 // \$5,%eax @@ -1067,8 +1080,8 @@ bl _vpaes_schedule_core eor x0, x0, x0 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key @@ -1078,9 +1091,9 @@ .align 4 vpaes_set_decrypt_key: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! 
// ABI spec says so lsr w9, $bits, #5 // shr \$5,%eax add w9, w9, #5 // \$5,%eax @@ -1095,15 +1108,15 @@ eor x8, x8, #32 // xor \$32,%r8d # nbits==192?0:32 bl _vpaes_schedule_core - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key ___ } { -my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5)); +my ($inp,$out,$len,$key,$ivec,$dir) = ("PTR(0)","PTR(1)","x2","PTR(3)","PTR(4)","x5"); $code.=<<___; .globl vpaes_cbc_encrypt @@ -1115,11 +1128,11 @@ cmp w5, #0 // check direction b.eq vpaes_cbc_decrypt - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 mov x17, $len // reassign - mov x2, $key // reassign + mov PTR(2), $key // reassign ld1 {v0.16b}, [$ivec] // load ivec bl _vpaes_encrypt_preheat @@ -1136,7 +1149,7 @@ st1 {v0.16b}, [$ivec] // write ivec - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) .Lcbc_abort: AARCH64_VALIDATE_LINK_REGISTER ret @@ -1147,15 +1160,15 @@ vpaes_cbc_decrypt: // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to // only from vpaes_cbc_encrypt which has already signed the return address. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! // ABI spec says so + stp d10,d11,[PTRN(sp),#-16]! + stp d12,d13,[PTRN(sp),#-16]! + stp d14,d15,[PTRN(sp),#-16]! mov x17, $len // reassign - mov x2, $key // reassign + mov PTR(2), $key // reassign ld1 {v6.16b}, [$ivec] // load ivec bl _vpaes_decrypt_preheat tst x17, #16 @@ -1183,11 +1196,11 @@ .Lcbc_dec_done: st1 {v6.16b}, [$ivec] - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d14,d15,[PTRN(sp)],#16 + ldp d12,d13,[PTRN(sp)],#16 + ldp d10,d11,[PTRN(sp)],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt @@ -1199,15 +1212,15 @@ .align 4 vpaes_ecb_encrypt: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! // ABI spec says so + stp d10,d11,[PTRN(sp),#-16]! + stp d12,d13,[PTRN(sp),#-16]! + stp d14,d15,[PTRN(sp),#-16]! mov x17, $len - mov x2, $key + mov PTR(2), $key bl _vpaes_encrypt_preheat tst x17, #16 b.eq .Lecb_enc_loop @@ -1227,11 +1240,11 @@ b.hi .Lecb_enc_loop .Lecb_enc_done: - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d14,d15,[PTRN(sp)],#16 + ldp d12,d13,[PTRN(sp)],#16 + ldp d10,d11,[PTRN(sp)],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt @@ -1241,15 +1254,15 @@ .align 4 vpaes_ecb_decrypt: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! // ABI spec says so + stp d10,d11,[PTRN(sp),#-16]! + stp d12,d13,[PTRN(sp),#-16]! + stp d14,d15,[PTRN(sp),#-16]! mov x17, $len - mov x2, $key + mov PTR(2), $key bl _vpaes_decrypt_preheat tst x17, #16 b.eq .Lecb_dec_loop @@ -1269,11 +1282,11 @@ b.hi .Lecb_dec_loop .Lecb_dec_done: - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d14,d15,[PTRN(sp)],#16 + ldp d12,d13,[PTRN(sp)],#16 + ldp d10,d11,[PTRN(sp)],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt From a36acd247038300ac1ff77f53fa4f926e926e22e Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Tue, 7 May 2024 15:15:15 -0700 Subject: [PATCH 06/19] OpenSSL armv8-mont.pl: Add purecap support --- crypto/openssl/crypto/bn/asm/armv8-mont.pl | 513 +++++++++++++-------- 1 file changed, 329 insertions(+), 184 deletions(-) diff --git a/crypto/openssl/crypto/bn/asm/armv8-mont.pl b/crypto/openssl/crypto/bn/asm/armv8-mont.pl index 21ab12bdf07e..42ebbd47642e 100755 --- a/crypto/openssl/crypto/bn/asm/armv8-mont.pl +++ b/crypto/openssl/crypto/bn/asm/armv8-mont.pl @@ -56,15 +56,21 @@ ($lo0,$hi0,$aj,$m0,$alo,$ahi, $lo1,$hi1,$nj,$m1,$nlo,$nhi, - $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24); + $ovf, $i,$j,$tpx,$tj) = map("x$_",6..17,19..24); +$tp = "PTR(22)"; # int bn_mul_mont( -$rp="x0"; # BN_ULONG *rp, -$ap="x1"; # const BN_ULONG *ap, -$bp="x2"; # const BN_ULONG *bp, -$np="x3"; # const BN_ULONG *np, -$n0="x4"; # const BN_ULONG *n0, +$rp="PTR(0)"; # BN_ULONG *rp, +$ap="PTR(1)"; # const BN_ULONG *ap, +$bp="PTR(2)"; # const BN_ULONG *bp, +$np="PTR(3)"; # const BN_ULONG *np, +$n0p="PTR(4)"; # const BN_ULONG *n0, $num="x5"; # int num); +$rpx="x0"; +$apx="x1"; +$bpx="x2"; +$npx="x3"; +$n0="x4"; $code.=<<___; #include "arm_arch.h" @@ -85,8 +91,8 @@ cmp $num,#32 b.le .Lscalar_impl #ifndef __KERNEL__ - adrp x17,OPENSSL_armv8_rsa_neonized - ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] + adrp PTR(17),OPENSSL_armv8_rsa_neonized + ldr w17,[PTR(17),#:lo12:OPENSSL_armv8_rsa_neonized] cbnz w17, bn_mul8x_mont_neon #endif @@ -97,18 +103,27 @@ b.eq __bn_mul4x_mont .Lmul_mont: - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] + stp PTR(29),PTR(30),[PTRN(sp),#-(8*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] ldr $m0,[$bp],#8 // bp[0] +#ifdef __CHERI_PURE_CAPABILITY__ + neg $lo0,$num + add $tp,csp,$lo0,lsl#3 +#else sub $tp,sp,$num,lsl#3 +#endif ldp $hi0,$aj,[$ap],#16 // ap[0..1] lsl $num,$num,#3 - ldr $n0,[$n0] // *n0 + ldr $n0,[$n0p] // *n0 +#ifdef __CHERI_PURE_CAPABILITY__ + alignd $tp,$tp,#4 // ABI says so +#else and $tp,$tp,#-16 // ABI says so +#endif ldp $hi1,$nj,[$np],#16 // np[0..1] mul $lo0,$hi0,$m0 // ap[0]*bp[0] @@ -118,7 +133,7 @@ umulh $ahi,$aj,$m0 mul $m1,$lo0,$n0 // "tp[0]"*n0 - mov sp,$tp // alloca + mov PTRN(sp),$tp // alloca // (*) mul $lo1,$hi1,$m1 // np[0]*m1 umulh $hi1,$hi1,$m1 @@ -158,13 +173,27 @@ cbnz $j,.L1st .L1st_skip: +#ifdef __CHERI_PURE_CAPABILITY__ + neg $num,$num +#endif adds $lo0,$alo,$hi0 +#ifdef __CHERI_PURE_CAPABILITY__ + add $ap,$ap,$num // rewind $ap +#else sub $ap,$ap,$num // rewind $ap +#endif adc $hi0,$ahi,xzr adds $lo1,$nlo,$hi1 +#ifdef __CHERI_PURE_CAPABILITY__ + add $np,$np,$num // rewind $np +#else sub $np,$np,$num // rewind $np +#endif adc $hi1,$nhi,xzr +#ifdef __CHERI_PURE_CAPABILITY__ + neg $num,$num +#endif adds $lo1,$lo1,$lo0 sub $i,$num,#8 // i=num-1 @@ -176,8 +205,8 @@ .Louter: ldr $m0,[$bp],#8 // bp[i] ldp $hi0,$aj,[$ap],#16 - ldr $tj,[sp] // tp[0] - add $tp,sp,#8 + ldr $tj,[PTRN(sp)] // tp[0] + add $tp,PTRN(sp),#8 mul $lo0,$hi0,$m0 // ap[0]*bp[i] sub $j,$num,#16 // j=num-2 @@ -223,16 +252,30 @@ cbnz $j,.Linner .Linner_skip: +#ifdef __CHERI_PURE_CAPABILITY__ + neg $num,$num +#endif ldr $tj,[$tp],#8 // tp[j] adc $hi1,$hi1,xzr adds $lo0,$alo,$hi0 +#ifdef __CHERI_PURE_CAPABILITY__ + add $ap,$ap,$num // rewind $ap +#else sub $ap,$ap,$num // rewind $ap +#endif adc $hi0,$ahi,xzr adds $lo1,$nlo,$hi1 +#ifdef __CHERI_PURE_CAPABILITY__ + add $np,$np,$num // rewind $np +#else sub $np,$np,$num // rewind $np +#endif adcs $hi1,$nhi,$ovf adc $ovf,xzr,xzr +#ifdef __CHERI_PURE_CAPABILITY__ + neg $num,$num +#endif adds $lo0,$lo0,$tj adc $hi0,$hi0,xzr @@ -248,8 +291,8 @@ // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. - ldr $tj,[sp] // tp[0] - add $tp,sp,#8 + ldr $tj,[PTRN(sp)] // tp[0] + add $tp,PTRN(sp),#8 ldr $nj,[$np],#8 // np[0] subs $j,$num,#8 // j=num-1 and clear borrow mov $ap,$rp @@ -265,8 +308,8 @@ sbcs $ovf,$ovf,xzr // did it borrow? 
str $aj,[$ap],#8 // rp[num-1] - ldr $tj,[sp] // tp[0] - add $tp,sp,#8 + ldr $tj,[PTRN(sp)] // tp[0] + add $tp,PTRN(sp),#8 ldr $aj,[$rp],#8 // rp[0] sub $num,$num,#8 // num-- nop @@ -283,12 +326,12 @@ stur xzr,[$tp,#-8] // wipe tp stur $nj,[$rp,#-8] - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + mov PTRN(sp),PTR(29) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] mov x0,#1 - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(8*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size bn_mul_mont,.-bn_mul_mont @@ -304,8 +347,10 @@ my $temp="v15"; my $ACCTemp="v16"; -my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("x$_",(0..5)); -my ($tinptr,$toutptr,$inner,$outer,$bnptr)=map("x$_",(6..11)); +my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=("PTR(0)","PTR(1)","PTR(2)","PTR(3)","PTR(4)","x5"); +my ($tinptr,$toutptr,$inner,$outer,$bnptr)=("PTR(6)","PTR(7)","x8","x9","PTR(10)"); +my ($aptrx,$bptrx)=("x1","x2"); +my ($negnum)=("x12"); $code.=<<___; .type bn_mul8x_mont_neon,%function @@ -313,25 +358,36 @@ bn_mul8x_mont_neon: // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to // only from bn_mul_mont which has already signed the return address. - stp x29,x30,[sp,#-80]! - mov x16,sp - stp d8,d9,[sp,#16] - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH+64)]! + mov PTR(16),PTRN(sp) + stp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] + stp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + stp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + stp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] lsl $num,$num,#1 eor $zero.16b,$zero.16b,$zero.16b +#ifdef __CHERI_PURE_CAPABILITY__ + neg $negnum,$num +#endif .align 4 .LNEON_8n: eor @ACC[0].16b,@ACC[0].16b,@ACC[0].16b - sub $toutptr,sp,#128 + sub $toutptr,PTRN(sp),#128 eor @ACC[1].16b,@ACC[1].16b,@ACC[1].16b +#ifdef __CHERI_PURE_CAPABILITY__ + add $toutptr,$toutptr,$negnum,lsl#4 +#else sub $toutptr,$toutptr,$num,lsl#4 +#endif eor @ACC[2].16b,@ACC[2].16b,@ACC[2].16b +#ifdef __CHERI_PURE_CAPABILITY__ + alignd $toutptr,$toutptr,#6 +#else and $toutptr,$toutptr,#-64 +#endif eor @ACC[3].16b,@ACC[3].16b,@ACC[3].16b - mov sp,$toutptr // alloca + mov PTRN(sp),$toutptr // alloca eor @ACC[4].16b,@ACC[4].16b,@ACC[4].16b add $toutptr,$toutptr,#256 eor @ACC[5].16b,@ACC[5].16b,@ACC[5].16b @@ -347,10 +403,10 @@ st1 {@ACC[6].2d,@ACC[7].2d},[$toutptr],#32 bne .LNEON_8n_init - add $tinptr,sp,#256 + add $tinptr,PTRN(sp),#256 ld1 {$A0.4s,$A1.4s},[$aptr],#32 - add $bnptr,sp,#8 - ldr $sM0,[$n0],#4 + add $bnptr,PTRN(sp),#8 + ldr $sM0,[$n0p],#4 mov $outer,$num b .LNEON_8n_outer @@ -358,7 +414,7 @@ .LNEON_8n_outer: ldr $sBi,[$bptr],#4 // *b++ uxtl $Bi.4s,$Bi.4h - add $toutptr,sp,#128 + add $toutptr,PTRN(sp),#128 ld1 {$N0.4s,$N1.4s},[$nptr],#32 umlal @ACC[0].2d,$Bi.2s,$A0.s[0] @@ -371,7 +427,7 @@ umlal @ACC[4].2d,$Bi.2s,$A1.s[0] mul $Ni.2s,$Ni.2s,$M0.2s umlal @ACC[5].2d,$Bi.2s,$A1.s[1] - st1 {$Bi.2s},[sp] // put aside smashed b[8*i+0] + st1 {$Bi.2s},[PTRN(sp)] // put aside smashed b[8*i+0] umlal @ACC[6].2d,$Bi.2s,$A1.s[2] uxtl $Ni.4s,$Ni.4h umlal @ACC[7].2d,$Bi.2s,$A1.s[3] @@ -416,7 +472,7 @@ ___ } $code.=<<___; - ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] + ld1 {$Bi.2s},[PTRN(sp)] // pull smashed b[8*i+0] umlal @ACC[0].2d,$Ni.2s,$N0.s[0] ld1 {$A0.4s,$A1.4s},[$aptr],#32 umlal @ACC[1].2d,$Ni.2s,$N0.s[1] @@ -435,7 +491,7 @@ umlal @ACC[7].2d,$Ni.2s,$N1.s[3] add @ACC[1].2d,@ACC[1].2d,@ACC[0].2d st1 {$Ni.2s},[$bnptr],#8 // put aside 
smashed m[8*i+$i] - add $bnptr,sp,#8 // rewind + add $bnptr,PTRN(sp),#8 // rewind ___ push(@ACC,shift(@ACC)); $code.=<<___; @@ -492,14 +548,18 @@ } $code.=<<___; b.ne .LInner_after_rewind$i +#ifdef __CHERI_PURE_CAPABILITY__ + add $aptr,$aptr,$negnum,lsl#2 // rewind +#else sub $aptr,$aptr,$num,lsl#2 // rewind +#endif .LInner_after_rewind$i: umlal @ACC[0].2d,$Ni.2s,$N0.s[0] - ld1 {$Bi.2s},[sp] // pull smashed b[8*i+0] + ld1 {$Bi.2s},[PTRN(sp)] // pull smashed b[8*i+0] umlal @ACC[1].2d,$Ni.2s,$N0.s[1] ld1 {$A0.4s,$A1.4s},[$aptr],#32 umlal @ACC[2].2d,$Ni.2s,$N0.s[2] - add $bnptr,sp,#8 // rewind + add $bnptr,PTRN(sp),#8 // rewind umlal @ACC[3].2d,$Ni.2s,$N0.s[3] umlal @ACC[4].2d,$Ni.2s,$N1.s[0] umlal @ACC[5].2d,$Ni.2s,$N1.s[1] @@ -511,7 +571,7 @@ ___ push(@ACC,shift(@ACC)); $code.=<<___; - add $tinptr,sp,#128 + add $tinptr,PTRN(sp),#128 st1 {@ACC[0].2d,@ACC[1].2d},[$toutptr],#32 eor $N0.16b,$N0.16b,$N0.16b // $N0 st1 {@ACC[2].2d,@ACC[3].2d},[$toutptr],#32 @@ -526,20 +586,24 @@ ld1 {@ACC[6].2d,@ACC[7].2d},[$tinptr],#32 b.eq .LInner_8n_jump_2steps +#ifdef __CHERI_PURE_CAPABILITY__ + add $nptr,$nptr,$negnum,lsl#2 // rewind +#else sub $nptr,$nptr,$num,lsl#2 // rewind +#endif b .LNEON_8n_outer .LInner_8n_jump_2steps: - add $toutptr,sp,#128 - st1 {$N0.2d,$N1.2d}, [sp],#32 // start wiping stack frame + add $toutptr,PTRN(sp),#128 + st1 {$N0.2d,$N1.2d}, [PTRN(sp)],#32 // start wiping stack frame mov $Temp.16b,@ACC[0].16b ushr $temp.2d,@ACC[0].2d,#16 ext @ACC[0].16b,@ACC[0].16b,@ACC[0].16b,#8 - st1 {$N0.2d,$N1.2d}, [sp],#32 + st1 {$N0.2d,$N1.2d}, [PTRN(sp)],#32 add @ACC[0].2d,@ACC[0].2d,$temp.2d - st1 {$N0.2d,$N1.2d}, [sp],#32 + st1 {$N0.2d,$N1.2d}, [PTRN(sp)],#32 ushr $temp.2d,@ACC[0].2d,#16 - st1 {$N0.2d,$N1.2d}, [sp],#32 + st1 {$N0.2d,$N1.2d}, [PTRN(sp)],#32 zip1 @ACC[0].4h,$Temp.4h,@ACC[0].4h ins $temp.d[1],$zero.d[0] @@ -584,9 +648,15 @@ bne .LNEON_tail st1 {$temp.s}[0], [$toutptr],#4 // top-most bit +#ifdef __CHERI_PURE_CAPABILITY__ + add $nptr,$nptr,$negnum,lsl#2 // rewind $nptr + sub $aptr,PTRN(sp),#0 + cmn x0,xzr // clear carry flag +#else sub $nptr,$nptr,$num,lsl#2 // rewind $nptr - subs $aptr,sp,#0 // clear carry flag - add $bptr,sp,$num,lsl#2 + subs $aptr,PTRN(sp),#0 // clear carry flag +#endif + add $bptr,PTRN(sp),$num,lsl#2 .LNEON_sub: ldp w4,w5,[$aptr],#8 @@ -597,7 +667,7 @@ sbcs w9,w5,w9 sbcs w10,w6,w10 sbcs w11,w7,w11 - sub x17,$bptr,$aptr + sub x17,$bptrx,$aptrx stp w8,w9,[$rptr],#8 stp w10,w11,[$rptr],#8 cbnz x17,.LNEON_sub @@ -605,10 +675,15 @@ ldr w10, [$aptr] // load top-most bit mov x11,sp eor v0.16b,v0.16b,v0.16b - sub x11,$bptr,x11 // this is num*4 + sub x11,$bptrx,x11 // this is num*4 eor v1.16b,v1.16b,v1.16b - mov $aptr,sp + mov $aptr,PTRN(sp) +#ifdef __CHERI_PURE_CAPABILITY__ + neg x11,x11 + add $rptr,$rptr,x11 // rewind $rptr +#else sub $rptr,$rptr,x11 // rewind $rptr +#endif mov $nptr,$bptr // second 3/4th of frame sbcs w10,w10,wzr // result is carry flag @@ -642,17 +717,17 @@ .LCopy_2: st1 {v0.2d,v1.2d}, [$aptr],#32 // wipe st1 {v0.2d,v1.2d}, [$nptr],#32 // wipe - sub x17,$bptr,$aptr // preserves carry + sub x17,$bptrx,$aptrx // preserves carry stp w8,w9,[$rptr],#8 stp w10,w11,[$rptr],#8 cbnz x17,.LNEON_copy_n_zap - mov sp,x16 - ldp d14,d15,[sp,#64] - ldp d12,d13,[sp,#48] - ldp d10,d11,[sp,#32] - ldp d8,d9,[sp,#16] - ldr x29,[sp],#80 + mov PTRN(sp),PTR(16) + ldp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] + ldp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + ldp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + ldp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH+64) 
AARCH64_VALIDATE_LINK_REGISTER ret // bx lr @@ -668,6 +743,8 @@ my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("x$_",(19..26)); my ($cnt,$carry,$topmost)=("x27","x28","x30"); my ($tp,$ap_end,$na0)=($bp,$np,$carry); +my ($t0p,$t1p,$t2p)=map("PTR($_)",(14..16)); +my ($tpx,$ap_endx)=($bpx,$npx); $code.=<<___; .type __bn_sqr8x_mont,%function @@ -678,24 +755,29 @@ .Lsqr8x_mont: // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to // only from bn_mul_mont which has already signed the return address. - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp $rp,$np,[sp,#96] // offload rp and np + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + stp $rp,$np,[PTRN(sp),#(12*PTR_WIDTH)] // offload rp and np ldp $a0,$a1,[$ap,#8*0] ldp $a2,$a3,[$ap,#8*2] ldp $a4,$a5,[$ap,#8*4] ldp $a6,$a7,[$ap,#8*6] +#ifdef __CHERI_PURE_CAPABILITY__ + neg $t0,$num + add $tp,csp,$t0,lsl#4 +#else sub $tp,sp,$num,lsl#4 +#endif lsl $num,$num,#3 - ldr $n0,[$n0] // *n0 - mov sp,$tp // alloca + ldr $n0,[$n0p] // *n0 + mov PTRN(sp),$tp // alloca sub $cnt,$num,#8*8 b .Lsqr8x_zero_start @@ -723,8 +805,8 @@ mov $acc5,xzr mov $acc6,xzr mov $acc7,xzr - mov $tp,sp - str $n0,[x29,#112] // offload n0 + mov $tp,PTRN(sp) + str $n0,[PTR(29),#(14*PTR_WIDTH)] // offload n0 // Multiply everything but a[i]*a[i] .align 4 @@ -882,11 +964,16 @@ umulh $t3,$a7,$a6 // hi(a[7]*a[6]) adc $acc5,xzr,xzr // t[13] adds $acc4,$acc4,$t0 - sub $cnt,$ap_end,$ap // done yet? + sub $cnt,$ap_endx,$apx // done yet? adc $acc5,$acc5,$t1 adds $acc5,$acc5,$t2 - sub $t0,$ap_end,$num // rewinded ap +#ifdef __CHERI_PURE_CAPABILITY__ + neg $t0,$num + add $t0p,$ap_end,$t0 // rewinded ap +#else + sub $t0p,$ap_end,$num // rewinded ap +#endif adc $acc6,xzr,xzr // t[14] add $acc6,$acc6,$t3 @@ -1008,31 +1095,36 @@ ldp $a0,$a1,[$rp,#8*0] add $ap,$rp,#8*8 ldp $a2,$a3,[$rp,#8*2] - sub $t0,$ap_end,$ap // is it last iteration? + sub $t0,$ap_endx,$apx // is it last iteration? 
ldp $a4,$a5,[$rp,#8*4] - sub $t1,$tp,$t0 +#ifdef __CHERI_PURE_CAPABILITY__ + neg $t0,$t0 + add $t1p,$tp,$t0 +#else + sub $t1p,$tp,$t0 +#endif ldp $a6,$a7,[$rp,#8*6] cbz $t0,.Lsqr8x_outer_loop stp $acc0,$acc1,[$tp,#8*0] - ldp $acc0,$acc1,[$t1,#8*0] + ldp $acc0,$acc1,[$t1p,#8*0] stp $acc2,$acc3,[$tp,#8*2] - ldp $acc2,$acc3,[$t1,#8*2] + ldp $acc2,$acc3,[$t1p,#8*2] stp $acc4,$acc5,[$tp,#8*4] - ldp $acc4,$acc5,[$t1,#8*4] + ldp $acc4,$acc5,[$t1p,#8*4] stp $acc6,$acc7,[$tp,#8*6] - mov $tp,$t1 - ldp $acc6,$acc7,[$t1,#8*6] + mov $tp,$t1p + ldp $acc6,$acc7,[$t1p,#8*6] b .Lsqr8x_outer_loop .align 4 .Lsqr8x_outer_break: // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] - ldp $a1,$a3,[$t0,#8*0] // recall that $t0 is &a[0] - ldp $t1,$t2,[sp,#8*1] - ldp $a5,$a7,[$t0,#8*2] - add $ap,$t0,#8*4 - ldp $t3,$t0,[sp,#8*3] + ldp $a1,$a3,[$t0p,#8*0] // recall that $t0 is &a[0] + ldp $t1,$t2,[PTRN(sp),#8*1] + ldp $a5,$a7,[$t0p,#8*2] + add $ap,$t0p,#8*4 + ldp $t3,$t0,[PTRN(sp),#8*3] stp $acc0,$acc1,[$tp,#8*0] mul $acc0,$a1,$a1 @@ -1041,7 +1133,7 @@ stp $acc4,$acc5,[$tp,#8*4] mul $a2,$a3,$a3 stp $acc6,$acc7,[$tp,#8*6] - mov $tp,sp + mov $tp,PTRN(sp) umulh $a3,$a3,$a3 adds $acc1,$a1,$t1,lsl#1 extr $t1,$t2,$t1,#63 @@ -1087,8 +1179,14 @@ cbnz $cnt,.Lsqr4x_shift_n_add ___ my ($np,$np_end)=($ap,$ap_end); +my ($npx,$np_endx)=($apx,$ap_endx); $code.=<<___; - ldp $np,$n0,[x29,#104] // pull np and n0 +#ifdef __CHERI_PURE_CAPABILITY__ + ldr $np,[PTR(29),#(13*PTR_WIDTH)] // pull np + ldr $n0,[PTR(29),#(14*PTR_WIDTH)] // pull n0 +#else + ldp $np,$n0,[PTR(29),#(13*PTR_WIDTH)] // pull np and n0 +#endif adcs $acc2,$a2,$t1 extr $t2,$t3,$t2,#63 @@ -1103,7 +1201,7 @@ extr $t3,$t0,$t3,#63 adcs $acc4,$a4,$t3 extr $t0,$t1,$t0,#63 - ldp $acc0,$acc1,[sp,#8*0] + ldp $acc0,$acc1,[PTRN(sp),#8*0] adcs $acc5,$a5,$t0 extr $t1,$t2,$t1,#63 ldp $a0,$a1,[$np,#8*0] @@ -1117,14 +1215,14 @@ mul $na0,$n0,$acc0 // t[0]*n0 ldp $a6,$a7,[$np,#8*6] add $np_end,$np,$num - ldp $acc2,$acc3,[sp,#8*2] + ldp $acc2,$acc3,[PTRN(sp),#8*2] stp $acc4,$acc5,[$tp,#8*4] - ldp $acc4,$acc5,[sp,#8*4] + ldp $acc4,$acc5,[PTRN(sp),#8*4] stp $acc6,$acc7,[$tp,#8*6] - ldp $acc6,$acc7,[sp,#8*6] + ldp $acc6,$acc7,[PTRN(sp),#8*6] add $np,$np,#8*8 mov $topmost,xzr // initial top-most carry - mov $tp,sp + mov $tp,PTRN(sp) mov $cnt,#8 .Lsqr8x_reduction: @@ -1170,7 +1268,7 @@ ldp $t0,$t1,[$tp,#8*0] ldp $t2,$t3,[$tp,#8*2] mov $rp,$tp - sub $cnt,$np_end,$np // done yet? + sub $cnt,$np_endx,$npx // done yet? adds $acc0,$acc0,$t0 adcs $acc1,$acc1,$t1 ldp $t0,$t1,[$tp,#8*4] @@ -1235,8 +1333,13 @@ // note that carry flag is guaranteed // to be zero at this point ldp $a0,$a1,[$tp,#8*0] - sub $cnt,$np_end,$np // done yet? - sub $t2,$np_end,$num // rewinded np + sub $cnt,$np_endx,$npx // done yet? 
+#ifdef __CHERI_PURE_CAPABILITY__ + neg $t2,$num + add $t2p,$np_end,$t2 // rewinded np +#else + sub $t2p,$np_end,$num // rewinded np +#endif ldp $a2,$a3,[$tp,#8*2] ldp $a4,$a5,[$tp,#8*4] ldp $a6,$a7,[$tp,#8*6] @@ -1262,24 +1365,24 @@ .align 4 .Lsqr8x_tail_break: - ldr $n0,[x29,#112] // pull n0 - add $cnt,$tp,#8*8 // end of current t[num] window + ldr $n0,[PTR(29),#(14*PTR_WIDTH)] // pull n0 + add $cnt,$tpx,#8*8 // end of current t[num] window subs xzr,$topmost,#1 // "move" top-most carry to carry bit adcs $t0,$acc0,$a0 adcs $t1,$acc1,$a1 ldp $acc0,$acc1,[$rp,#8*0] adcs $acc2,$acc2,$a2 - ldp $a0,$a1,[$t2,#8*0] // recall that $t2 is &n[0] + ldp $a0,$a1,[$t2p,#8*0] // recall that $t2 is &n[0] adcs $acc3,$acc3,$a3 - ldp $a2,$a3,[$t2,#8*2] + ldp $a2,$a3,[$t2p,#8*2] adcs $acc4,$acc4,$a4 adcs $acc5,$acc5,$a5 - ldp $a4,$a5,[$t2,#8*4] + ldp $a4,$a5,[$t2p,#8*4] adcs $acc6,$acc6,$a6 adcs $acc7,$acc7,$a7 - ldp $a6,$a7,[$t2,#8*6] - add $np,$t2,#8*8 + ldp $a6,$a7,[$t2p,#8*6] + add $np,$t2p,#8*8 adc $topmost,xzr,xzr // top-most carry mul $na0,$n0,$acc0 stp $t0,$t1,[$tp,#8*0] @@ -1298,7 +1401,7 @@ // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. - ldr $rp,[x29,#96] // pull rp + ldr $rp,[PTR(29),#(12*PTR_WIDTH)] // pull rp add $tp,$tp,#8*8 subs $t0,$acc0,$a0 sbcs $t1,$acc1,$a1 @@ -1333,8 +1436,8 @@ cbnz $cnt,.Lsqr8x_sub sbcs $t2,$acc2,$a2 - mov $tp,sp - add $ap,sp,$num + mov $tp,PTRN(sp) + add $ap,PTRN(sp),$num ldp $a0,$a1,[$ap_end,#8*0] sbcs $t3,$acc3,$a3 stp $t0,$t1,[$rp,#8*0] @@ -1347,7 +1450,7 @@ sbcs $t3,$acc7,$a7 ldp $acc2,$acc3,[$ap,#8*2] sbcs xzr,$topmost,xzr // did it borrow? - ldr x30,[x29,#8] // pull return address + ldr PTR(30),[PTR(29),#PTR_WIDTH] // pull return address stp $t0,$t1,[$rp,#8*4] stp $t2,$t3,[$rp,#8*6] @@ -1387,26 +1490,26 @@ .align 4 .Lsqr8x8_post_condition: adc $carry,xzr,xzr - ldr x30,[x29,#8] // pull return address + ldr PTR(30),[PTR(29),#PTR_WIDTH] // pull return address // $acc0-7,$carry hold result, $a0-7 hold modulus subs $a0,$acc0,$a0 - ldr $ap,[x29,#96] // pull rp + ldr $ap,[PTR(29),#(12*PTR_WIDTH)] // pull rp sbcs $a1,$acc1,$a1 - stp xzr,xzr,[sp,#8*0] + stp xzr,xzr,[PTRN(sp),#8*0] sbcs $a2,$acc2,$a2 - stp xzr,xzr,[sp,#8*2] + stp xzr,xzr,[PTRN(sp),#8*2] sbcs $a3,$acc3,$a3 - stp xzr,xzr,[sp,#8*4] + stp xzr,xzr,[PTRN(sp),#8*4] sbcs $a4,$acc4,$a4 - stp xzr,xzr,[sp,#8*6] + stp xzr,xzr,[PTRN(sp),#8*6] sbcs $a5,$acc5,$a5 - stp xzr,xzr,[sp,#8*8] + stp xzr,xzr,[PTRN(sp),#8*8] sbcs $a6,$acc6,$a6 - stp xzr,xzr,[sp,#8*10] + stp xzr,xzr,[PTRN(sp),#8*10] sbcs $a7,$acc7,$a7 - stp xzr,xzr,[sp,#8*12] + stp xzr,xzr,[PTRN(sp),#8*12] sbcs $carry,$carry,xzr // did it borrow? 
- stp xzr,xzr,[sp,#8*14] + stp xzr,xzr,[PTRN(sp),#8*14] // $a0-7 hold result-modulus csel $a0,$acc0,$a0,lo @@ -1423,14 +1526,14 @@ stp $a6,$a7,[$ap,#8*6] .Lsqr8x_done: - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + mov PTRN(sp),PTR(29) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] mov x0,#1 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(16*PTR_WIDTH) // x30 is loaded earlier AARCH64_VALIDATE_LINK_REGISTER ret @@ -1448,9 +1551,11 @@ $t0,$t1,$t2,$t3, $m0,$m1,$m2,$m3, $acc0,$acc1,$acc2,$acc3,$acc4, - $bi,$mi,$tp,$ap_end,$cnt) = map("x$_",(6..17,19..28)); + $bi,$mi,$cnt) = map("x$_",(6..17,19..25,28)); +my ($tp,$ap_end,$ap_endx) = ("PTR(26)", "PTR(27)","x27"); +my ($t0p,$t1p,$t2p,$t3p) = map("PTR($_)",(10..13)); my $bp_end=$rp; -my ($carry,$topmost) = ($rp,"x30"); +my ($carry,$topmost) = ($rpx,"x30"); $code.=<<___; .type __bn_mul4x_mont,%function @@ -1458,22 +1563,27 @@ __bn_mul4x_mont: // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address. - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + +#ifdef __CHERI_PURE_CAPABILITY__ + neg $t0,$num + add $tp,csp,$t0,lsl#3 +#else sub $tp,sp,$num,lsl#3 +#endif lsl $num,$num,#3 - ldr $n0,[$n0] // *n0 - sub sp,$tp,#8*4 // alloca + ldr $n0,[$n0p] // *n0 + sub PTRN(sp),$tp,#8*4 // alloca - add $t0,$bp,$num + add $t0p,$bp,$num add $ap_end,$ap,$num - stp $rp,$t0,[x29,#96] // offload rp and &b[num] + stp $rp,$t0p,[PTR(29),#(12*PTR_WIDTH)] // offload rp and &b[num] ldr $bi,[$bp,#8*0] // b[0] ldp $a0,$a1,[$ap,#8*0] // a[0..3] @@ -1485,10 +1595,15 @@ mov $acc3,xzr ldp $m0,$m1,[$np,#8*0] // n[0..3] ldp $m2,$m3,[$np,#8*2] +#ifdef __CHERI_PURE_CAPABILITY__ + add $np,$np,#8*4 + cmn x0,xzr // clear carry bit +#else adds $np,$np,#8*4 // clear carry bit +#endif mov $carry,xzr mov $cnt,#0 - mov $tp,sp + mov $tp,PTRN(sp) .Loop_mul4x_1st_reduction: mul $t0,$a0,$bi // lo(a[0..3]*b[0]) @@ -1530,7 +1645,7 @@ adcs $acc3,$acc4,$carry adc $carry,xzr,xzr adds $acc0,$acc0,$t0 - sub $t0,$ap_end,$ap + sub $t0,$ap_endx,$apx adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 @@ -1542,7 +1657,7 @@ ldp $a0,$a1,[$ap,#8*0] // a[4..7] ldp $a2,$a3,[$ap,#8*2] add $ap,$ap,#8*4 - ldr $mi,[sp] // a[0]*n0 + ldr $mi,[PTRN(sp)] // a[0]*n0 ldp $m0,$m1,[$np,#8*0] // n[4..7] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 @@ -1583,17 +1698,22 @@ adcs $acc4,$acc4,$carry umulh $t3,$m3,$mi adc $carry,xzr,xzr - ldr $mi,[sp,$cnt] // next t[0]*n0 + ldr $mi,[PTRN(sp),$cnt] // next t[0]*n0 str $acc0,[$tp],#8 // result!!! adds $acc0,$acc1,$t0 - sub $t0,$ap_end,$ap // done yet? + sub $t0,$ap_endx,$apx // done yet? 
adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 adcs $acc3,$acc4,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_1st_tail - sub $t1,$ap_end,$num // rewinded $ap +#ifdef __CHERI_PURE_CAPABILITY__ + neg $t1,$num + add $t1p,$ap_end,$t1 // rewinded $ap +#else + sub $t1p,$ap_end,$num // rewinded $ap +#endif cbz $t0,.Lmul4x_proceed ldp $a0,$a1,[$ap,#8*0] @@ -1608,20 +1728,30 @@ .Lmul4x_proceed: ldr $bi,[$bp,#8*4]! // *++b adc $topmost,$carry,xzr - ldp $a0,$a1,[$t1,#8*0] // a[0..3] + ldp $a0,$a1,[$t1p,#8*0] // a[0..3] +#ifdef __CHERI_PURE_CAPABILITY__ + neg $t0,$num + add $np,$np,$t0 // rewind np +#else sub $np,$np,$num // rewind np - ldp $a2,$a3,[$t1,#8*2] - add $ap,$t1,#8*4 +#endif + ldp $a2,$a3,[$t1p,#8*2] + add $ap,$t1p,#8*4 stp $acc0,$acc1,[$tp,#8*0] // result!!! - ldp $acc0,$acc1,[sp,#8*4] // t[0..3] + ldp $acc0,$acc1,[PTRN(sp),#8*4] // t[0..3] stp $acc2,$acc3,[$tp,#8*2] // result!!! - ldp $acc2,$acc3,[sp,#8*6] + ldp $acc2,$acc3,[PTRN(sp),#8*6] ldp $m0,$m1,[$np,#8*0] // n[0..3] - mov $tp,sp + mov $tp,PTRN(sp) ldp $m2,$m3,[$np,#8*2] +#ifdef __CHERI_PURE_CAPABILITY__ + add $np,$np,#8*4 + cmn x0,xzr // clear carry bit +#else adds $np,$np,#8*4 // clear carry bit +#endif mov $carry,xzr .align 4 @@ -1683,7 +1813,7 @@ adcs $acc3,$acc3,$t3 //adc $carry,$carry,xzr - ldr $mi,[sp] // t[0]*n0 + ldr $mi,[PTRN(sp)] // t[0]*n0 ldp $m0,$m1,[$np,#8*0] // n[4..7] ldp $m2,$m3,[$np,#8*2] add $np,$np,#8*4 @@ -1724,18 +1854,23 @@ adcs $acc3,$acc3,$t3 umulh $t3,$m3,$mi adcs $acc4,$acc4,$carry - ldr $mi,[sp,$cnt] // next a[0]*n0 + ldr $mi,[PTRN(sp),$cnt] // next a[0]*n0 adc $carry,xzr,xzr str $acc0,[$tp],#8 // result!!! adds $acc0,$acc1,$t0 - sub $t0,$ap_end,$ap // done yet? + sub $t0,$ap_endx,$apx // done yet? adcs $acc1,$acc2,$t1 adcs $acc2,$acc3,$t2 adcs $acc3,$acc4,$t3 //adc $carry,$carry,xzr cbnz $cnt,.Loop_mul4x_tail - sub $t1,$np,$num // rewinded np? +#ifdef __CHERI_PURE_CAPABILITY__ + neg $t1,$num + add $t1p,$np,$t1 // rewinded np? +#else + sub $t1p,$np,$num // rewinded np? +#endif adc $carry,$carry,xzr cbz $t0,.Loop_mul4x_break @@ -1756,30 +1891,40 @@ .align 4 .Loop_mul4x_break: - ldp $t2,$t3,[x29,#96] // pull rp and &b[num] + ldp $t2p,$t3p,[PTR(29),#(12*PTR_WIDTH)] // pull rp and &b[num] adds $acc0,$acc0,$topmost add $bp,$bp,#8*4 // bp++ adcs $acc1,$acc1,xzr +#ifdef __CHERI_PURE_CAPABILITY__ + neg $t0,$num + add $ap,$ap,$t0 // rewind ap +#else sub $ap,$ap,$num // rewind ap +#endif adcs $acc2,$acc2,xzr stp $acc0,$acc1,[$tp,#8*0] // result!!! adcs $acc3,$acc3,xzr - ldp $acc0,$acc1,[sp,#8*4] // t[0..3] + ldp $acc0,$acc1,[PTRN(sp),#8*4] // t[0..3] adc $topmost,$carry,xzr stp $acc2,$acc3,[$tp,#8*2] // result!!! - cmp $bp,$t3 // done yet? - ldp $acc2,$acc3,[sp,#8*6] - ldp $m0,$m1,[$t1,#8*0] // n[0..3] - ldp $m2,$m3,[$t1,#8*2] - add $np,$t1,#8*4 + cmp $bp,$t3p // done yet? + ldp $acc2,$acc3,[PTRN(sp),#8*6] + ldp $m0,$m1,[$t1p,#8*0] // n[0..3] + ldp $m2,$m3,[$t1p,#8*2] + add $np,$t1p,#8*4 b.eq .Lmul4x_post ldr $bi,[$bp] ldp $a0,$a1,[$ap,#8*0] // a[0..3] ldp $a2,$a3,[$ap,#8*2] +#ifdef __CHERI_PURE_CAPABILITY__ + add $ap,$ap,#8*4 + cmn x0,xzr // clear carry bit +#else adds $ap,$ap,#8*4 // clear carry bit +#endif mov $carry,xzr - mov $tp,sp + mov $tp,PTRN(sp) b .Loop_mul4x_reduction .align 4 @@ -1788,10 +1933,10 @@ // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. 
- mov $rp,$t2 - mov $ap_end,$t2 // $rp copy + mov $rp,$t2p + mov $ap_end,$t2p // $rp copy subs $t0,$acc0,$m0 - add $tp,sp,#8*8 + add $tp,PTRN(sp),#8*8 sbcs $t1,$acc1,$m1 sub $cnt,$num,#8*4 @@ -1813,8 +1958,8 @@ cbnz $cnt,.Lmul4x_sub sbcs $t2,$acc2,$m2 - mov $tp,sp - add $ap,sp,#8*4 + mov $tp,PTRN(sp) + add $ap,PTRN(sp),#8*4 ldp $a0,$a1,[$ap_end,#8*0] sbcs $t3,$acc3,$m3 stp $t0,$t1,[$rp,#8*0] @@ -1823,7 +1968,7 @@ ldp $acc0,$acc1,[$ap,#8*0] ldp $acc2,$acc3,[$ap,#8*2] sbcs xzr,$topmost,xzr // did it borrow? - ldr x30,[x29,#8] // pull return address + ldr PTR(30),[PTR(29),#PTR_WIDTH] // pull return address sub $cnt,$num,#8*4 .Lmul4x_cond_copy: @@ -1861,18 +2006,18 @@ .align 4 .Lmul4x4_post_condition: adc $carry,$carry,xzr - ldr $ap,[x29,#96] // pull rp + ldr $ap,[PTR(29),#(12*PTR_WIDTH)] // pull rp // $acc0-3,$carry hold result, $m0-7 hold modulus subs $a0,$acc0,$m0 - ldr x30,[x29,#8] // pull return address + ldr PTR(30),[PTR(29),#PTR_WIDTH] // pull return address sbcs $a1,$acc1,$m1 - stp xzr,xzr,[sp,#8*0] + stp xzr,xzr,[PTRN(sp),#8*0] sbcs $a2,$acc2,$m2 - stp xzr,xzr,[sp,#8*2] + stp xzr,xzr,[PTRN(sp),#8*2] sbcs $a3,$acc3,$m3 - stp xzr,xzr,[sp,#8*4] + stp xzr,xzr,[PTRN(sp),#8*4] sbcs xzr,$carry,xzr // did it borrow? - stp xzr,xzr,[sp,#8*6] + stp xzr,xzr,[PTRN(sp),#8*6] // $a0-3 hold result-modulus csel $a0,$acc0,$a0,lo @@ -1883,14 +2028,14 @@ stp $a2,$a3,[$ap,#8*2] .Lmul4x_done: - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + mov PTRN(sp),PTR(29) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] mov x0,#1 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(16*PTR_WIDTH) // x30 loaded earlier AARCH64_VALIDATE_LINK_REGISTER ret From 560d92d8776312fffea0ac5e510f2bc034cb2e40 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 9 May 2024 11:40:51 -0700 Subject: [PATCH 07/19] OpenSSL chacha-armv8.pl: Add purecap support --- .../openssl/crypto/chacha/asm/chacha-armv8.pl | 252 +++++++++--------- 1 file changed, 127 insertions(+), 125 deletions(-) diff --git a/crypto/openssl/crypto/chacha/asm/chacha-armv8.pl b/crypto/openssl/crypto/chacha/asm/chacha-armv8.pl index e1a8b8159421..59be73d58fec 100755 --- a/crypto/openssl/crypto/chacha/asm/chacha-armv8.pl +++ b/crypto/openssl/crypto/chacha/asm/chacha-armv8.pl @@ -65,10 +65,12 @@ () $code .= "\t$opcode\t".join(',',@_,$arg)."\n"; } -my ($out,$inp,$len,$key,$ctr) = map("x$_",(0..4)); +my ($out,$inp,$len,$key,$ctr) = ("PTR(0)","PTR(1)","x2","PTR(3)","x4"); +my ($ctrp) = ("PTR(4)"); my @x=map("x$_",(5..17,19..21)); my @d=map("x$_",(22..28,30)); +my @p=map("PTR($_)",(5..17,19..21)); sub ROUND { my ($a0,$b0,$c0,$d0)=@_; @@ -159,28 +161,28 @@ sub ROUND { b.lo .Lshort #ifndef __KERNEL__ - adrp x17,OPENSSL_armcap_P - ldr w17,[x17,#:lo12:OPENSSL_armcap_P] + adrp PTR(17),OPENSSL_armcap_P + ldr w17,[PTR(17),#:lo12:OPENSSL_armcap_P] tst w17,#ARMV7_NEON b.ne .LChaCha20_neon #endif .Lshort: - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr @x[0],.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - ldp @d[0],@d[1],[@x[0]] // load sigma + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + + adr @p[0],.Lsigma + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#64 + + ldp @d[0],@d[1],[@p[0]] // load sigma ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] - ldp @d[6],@d[7],[$ctr] // load counter + ldp @d[6],@d[7],[$ctrp] // load counter #ifdef __AARCH64EB__ ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 @@ -278,13 +280,13 @@ sub ROUND { b.hi .Loop_outer - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) .Labort: AARCH64_VALIDATE_LINK_REGISTER ret @@ -296,7 +298,7 @@ sub ROUND { sub $out,$out,#1 add $inp,$inp,$len add $out,$out,$len - add $ctr,sp,$len + add $ctrp,PTRN(sp),$len neg $len,$len add @x[0],@x[0],@x[1],lsl#32 // pack @@ -317,31 +319,31 @@ sub ROUND { rev @x[12],@x[12] rev @x[14],@x[14] #endif - stp @x[0],@x[2],[sp,#0] - stp @x[4],@x[6],[sp,#16] - stp @x[8],@x[10],[sp,#32] - stp @x[12],@x[14],[sp,#48] + stp @x[0],@x[2],[PTRN(sp),#0] + stp @x[4],@x[6],[PTRN(sp),#16] + stp @x[8],@x[10],[PTRN(sp),#32] + stp @x[12],@x[14],[PTRN(sp),#48] .Loop_tail: ldrb w10,[$inp,$len] - ldrb w11,[$ctr,$len] + ldrb w11,[$ctrp,$len] add $len,$len,#1 eor w10,w10,w11 strb w10,[$out,$len] cbnz $len,.Loop_tail - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + stp xzr,xzr,[PTRN(sp),#0] + stp xzr,xzr,[PTRN(sp),#16] + stp xzr,xzr,[PTRN(sp),#32] + stp xzr,xzr,[PTRN(sp),#48] + + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_ctr32,.-ChaCha20_ctr32 @@ -434,29 +436,29 @@ sub NEON_lane_ROUND { ChaCha20_neon: AARCH64_SIGN_LINK_REGISTER .LChaCha20_neon: - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr @x[0],.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + + adr @p[0],.Lsigma + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] cmp $len,#512 b.hs .L512_or_more_neon - sub sp,sp,#64 + sub PTRN(sp),PTRN(sp),#64 - ldp @d[0],@d[1],[@x[0]] // load sigma - ld1 {@K[0]},[@x[0]],#16 + ldp @d[0],@d[1],[@p[0]] // load sigma + ld1 {@K[0]},[@p[0]],#16 ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ld1 {@K[1],@K[2]},[$key] - ldp @d[6],@d[7],[$ctr] // load counter - ld1 {@K[3]},[$ctr] - stp d8,d9,[sp] // meet ABI requirements - ld1 {$CTR,$ROT24},[@x[0]] + ldp @d[6],@d[7],[$ctrp] // load counter + ld1 {@K[3]},[$ctrp] + stp d8,d9,[PTRN(sp)] // meet ABI requirements + ld1 {$CTR,$ROT24},[@p[0]] #ifdef __AARCH64EB__ rev64 @K[0],@K[0] ror @d[2],@d[2],#32 @@ -658,22 +660,22 @@ sub NEON_lane_ROUND { b.hi .Loop_outer_neon - ldp d8,d9,[sp] // meet ABI requirements + ldp d8,d9,[PTRN(sp)] // meet ABI requirements - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .align 4 .Ltail_neon: add $len,$len,#320 - ldp d8,d9,[sp] // meet ABI requirements + ldp d8,d9,[PTRN(sp)] // meet ABI requirements cmp $len,#64 b.lo .Less_than_64 @@ -770,35 +772,35 @@ sub NEON_lane_ROUND { sub $len,$len,#64 .Last_neon: - st1.8 {$xa0-$xd0},[sp] + st1.8 {$xa0-$xd0},[PTRN(sp)] sub $out,$out,#1 add $inp,$inp,$len add $out,$out,$len - add $ctr,sp,$len + add $ctrp,PTRN(sp),$len neg $len,$len .Loop_tail_neon: ldrb w10,[$inp,$len] - ldrb w11,[$ctr,$len] + ldrb w11,[$ctrp,$len] add $len,$len,#1 eor w10,w10,w11 strb w10,[$out,$len] cbnz $len,.Loop_tail_neon - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] + stp xzr,xzr,[PTRN(sp),#0] + stp xzr,xzr,[PTRN(sp),#16] + stp xzr,xzr,[PTRN(sp),#32] + stp xzr,xzr,[PTRN(sp),#48] .Ldone_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_neon,.-ChaCha20_neon @@ -845,29 +847,29 @@ sub NEONROUND { .align 5 ChaCha20_512_neon: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 - adr @x[0],.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] + adr @p[0],.Lsigma + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] .L512_or_more_neon: - sub sp,sp,#128+64 + sub PTRN(sp),PTRN(sp),#128+64 eor $ONE,$ONE,$ONE - ldp @d[0],@d[1],[@x[0]] // load sigma - ld1 {@K[0]},[@x[0]],#16 + ldp @d[0],@d[1],[@p[0]] // load sigma + ld1 {@K[0]},[@p[0]],#16 ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ld1 {@K[1],@K[2]},[$key] - ldp @d[6],@d[7],[$ctr] // load counter - ld1 {@K[3]},[$ctr] - ld1 {$ONE}[0],[@x[0]] - add $key,@x[0],#16 // .Lrot24 + ldp @d[6],@d[7],[$ctrp] // load counter + ld1 {@K[3]},[$ctrp] + ld1 {$ONE}[0],[@p[0]] + add $key,@p[0],#16 // .Lrot24 #ifdef __AARCH64EB__ rev64 @K[0],@K[0] ror @d[2],@d[2],#32 @@ -878,18 +880,18 @@ sub NEONROUND { ror @d[7],@d[7],#32 #endif add @K[3],@K[3],$ONE // += 1 - stp @K[0],@K[1],[sp,#0] // off-load key block, invariant part + stp @K[0],@K[1],[PTRN(sp),#0] // off-load key block, invariant part add @K[3],@K[3],$ONE // not typo - str @K[2],[sp,#32] + str @K[2],[PTRN(sp),#32] add @K[4],@K[3],$ONE add @K[5],@K[4],$ONE add @K[6],@K[5],$ONE shl $ONE,$ONE,#2 // 1 -> 4 - stp d8,d9,[sp,#128+0] // meet ABI requirements - stp d10,d11,[sp,#128+16] - stp d12,d13,[sp,#128+32] - stp d14,d15,[sp,#128+48] + stp d8,d9,[PTRN(sp),#128+0] // meet ABI requirements + stp d10,d11,[PTRN(sp),#128+16] + stp d12,d13,[PTRN(sp),#128+32] + stp d14,d15,[PTRN(sp),#128+48] sub $len,$len,#512 // not typo @@ -933,9 +935,9 @@ sub NEONROUND { mov $C3,@K[2] lsr @x[15],@d[7],#32 mov $C4,@K[2] - stp @K[3],@K[4],[sp,#48] // off-load key block, variable part + stp @K[3],@K[4],[PTRN(sp),#48] // off-load key block, variable part mov $C5,@K[2] - stp @K[5],@K[6],[sp,#80] + stp @K[5],@K[6],[PTRN(sp),#80] mov $ctr,#5 ld1 {$rot24},[$key] @@ -1094,13 +1096,13 @@ sub NEONROUND { cbnz $ctr,.Loop_lower_neon add.32 @x[0],@x[0],@d[0] // accumulate key block - ldp @K[0],@K[1],[sp,#0] + ldp @K[0],@K[1],[PTRN(sp),#0] add @x[1],@x[1],@d[0],lsr#32 - ldp @K[2],@K[3],[sp,#32] + ldp @K[2],@K[3],[PTRN(sp),#32] add.32 @x[2],@x[2],@d[1] - ldp @K[4],@K[5],[sp,#64] + ldp @K[4],@K[5],[PTRN(sp),#64] add @x[3],@x[3],@d[1],lsr#32 - ldr @K[6],[sp,#96] + ldr @K[6],[PTRN(sp),#96] add $A0,$A0,@K[0] add.32 @x[4],@x[4],@d[2] add $A1,$A1,@K[0] @@ -1195,9 +1197,9 @@ sub NEONROUND { ld1.8 {$A1-$D1},[$inp],#64 eor $A2,$A2,$A0 - ldp @K[0],@K[1],[sp,#0] + ldp @K[0],@K[1],[PTRN(sp),#0] eor $B2,$B2,$B0 - ldp @K[2],@K[3],[sp,#32] + ldp @K[2],@K[3],[PTRN(sp),#32] eor $C2,$C2,$C0 eor $D2,$D2,$D0 st1.8 {$A2-$D2},[$out],#64 @@ -1233,24 +1235,24 @@ sub NEONROUND { adds $len,$len,#512 ushr $ONE,$ONE,#1 // 4 -> 2 - ldp d10,d11,[sp,#128+16] // meet ABI requirements - ldp d12,d13,[sp,#128+32] - ldp d14,d15,[sp,#128+48] + ldp d10,d11,[PTRN(sp),#128+16] // meet ABI requirements + ldp d12,d13,[PTRN(sp),#128+32] + ldp d14,d15,[PTRN(sp),#128+48] - stp @K[0],@K[0],[sp,#0] // wipe off-load area - stp @K[0],@K[0],[sp,#32] - stp @K[0],@K[0],[sp,#64] + stp @K[0],@K[0],[PTRN(sp),#0] // wipe off-load area + stp @K[0],@K[0],[PTRN(sp),#32] + stp @K[0],@K[0],[PTRN(sp),#64] b.eq .Ldone_512_neon sub $key,$key,#16 // .Lone cmp $len,#192 - add sp,sp,#128 + add PTRN(sp),PTRN(sp),#128 sub @K[3],@K[3],$ONE // -= 2 ld1 {$CTR,$ROT24},[$key] b.hs .Loop_outer_neon - 
ldp d8,d9,[sp,#0] // meet ABI requirements + ldp d8,d9,[PTRN(sp),#0] // meet ABI requirements eor @K[1],@K[1],@K[1] eor @K[2],@K[2],@K[2] eor @K[3],@K[3],@K[3] @@ -1260,14 +1262,14 @@ sub NEONROUND { b .Loop_outer .Ldone_512_neon: - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp x19,x20,[x29,#16] - add sp,sp,#128+64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + ldp d8,d9,[PTRN(sp),#128+0] // meet ABI requirements + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#128+64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_512_neon,.-ChaCha20_512_neon From 85b582b811f2a45c53a56abe84e10fea39d29ac7 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 15 May 2024 13:34:25 -0700 Subject: [PATCH 08/19] OpenSSL ecp_nistz256-armv8.pl: Add purecap support --- .../crypto/ec/asm/ecp_nistz256-armv8.pl | 552 +++++++++--------- 1 file changed, 278 insertions(+), 274 deletions(-) diff --git a/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl index 6c5d0e8b3cf0..9a9114e37bbb 100755 --- a/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl +++ b/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -46,11 +46,12 @@ *STDOUT=*OUT; { -my ($rp,$ap,$bp,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, +my ($rpx,$apx,$bpx,$bi,$a0,$a1,$a2,$a3,$t0,$t1,$t2,$t3,$poly1,$poly3, $acc0,$acc1,$acc2,$acc3,$acc4,$acc5) = map("x$_",(0..17,19,20)); +my ($rp,$ap,$bp) = map("PTR($_)",(0..2)); -my ($acc6,$acc7)=($ap,$bp); # used in __ecp_nistz256_sqr_mont +my ($acc6,$acc7)=($apx,$bpx); # used in __ecp_nistz256_sqr_mont $code.=<<___; #include "arm_arch.h" @@ -123,9 +124,9 @@ .align 6 ecp_nistz256_to_mont: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] + stp PTR(29),PTR(30),[PTRN(sp),#-(4*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] ldr $bi,.LRR // bp[0] ldp $a0,$a1,[$ap] @@ -136,8 +137,8 @@ bl __ecp_nistz256_mul_mont - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(4*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont @@ -148,9 +149,9 @@ .align 4 ecp_nistz256_from_mont: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] + stp PTR(29),PTR(30),[PTRN(sp),#-(4*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] mov $bi,#1 // bp[0] ldp $a0,$a1,[$ap] @@ -161,8 +162,8 @@ bl __ecp_nistz256_mul_mont - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(4*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont @@ -174,9 +175,9 @@ .align 4 ecp_nistz256_mul_mont: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] + stp PTR(29),PTR(30),[PTRN(sp),#-(4*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] ldr $bi,[$bp] // bp[0] ldp $a0,$a1,[$ap] @@ -186,8 +187,8 @@ bl __ecp_nistz256_mul_mont - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(4*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont @@ -198,9 +199,9 @@ .align 4 ecp_nistz256_sqr_mont: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] + stp PTR(29),PTR(30),[PTRN(sp),#-(4*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] @@ -209,8 +210,8 @@ bl __ecp_nistz256_sqr_mont - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(4*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont @@ -222,8 +223,8 @@ .align 4 ecp_nistz256_add: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ldp $acc0,$acc1,[$ap] ldp $t0,$t1,[$bp] @@ -234,7 +235,7 @@ bl __ecp_nistz256_add - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_add,.-ecp_nistz256_add @@ -245,8 +246,8 @@ .align 4 ecp_nistz256_div_by_2: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ldp $acc0,$acc1,[$ap] ldp $acc2,$acc3,[$ap,#16] @@ -255,7 +256,7 @@ bl __ecp_nistz256_div_by_2 - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 @@ -266,8 +267,8 @@ .align 4 ecp_nistz256_mul_by_2: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ldp $acc0,$acc1,[$ap] ldp $acc2,$acc3,[$ap,#16] @@ -280,7 +281,7 @@ bl __ecp_nistz256_add // ret = a+a // 2*a - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 @@ -291,8 +292,8 @@ .align 4 ecp_nistz256_mul_by_3: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ldp $acc0,$acc1,[$ap] ldp $acc2,$acc3,[$ap,#16] @@ -316,7 +317,7 @@ bl __ecp_nistz256_add // ret += a // 2*a+a=3*a - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 @@ -328,8 +329,8 @@ .align 4 ecp_nistz256_sub: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ldp $acc0,$acc1,[$ap] ldp $acc2,$acc3,[$ap,#16] @@ -338,7 +339,7 @@ bl __ecp_nistz256_sub_from - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sub,.-ecp_nistz256_sub @@ -349,8 +350,8 @@ .align 4 ecp_nistz256_neg: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 mov $bp,$ap mov $acc0,xzr // a = 0 @@ -362,7 +363,7 @@ bl __ecp_nistz256_sub_from - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_neg,.-ecp_nistz256_neg @@ -604,13 +605,13 @@ adcs $acc1,$acc1,$t1 adcs $acc2,$acc2,$t2 adcs $acc3,$acc3,$t3 - adc $ap,xzr,xzr // zap $ap + adc $apx,xzr,xzr // zap $ap adds $t0,$acc0,#1 // subs $t0,$a0,#-1 // tmp = ret-modulus sbcs $t1,$acc1,$poly1 sbcs $t2,$acc2,xzr sbcs $t3,$acc3,$poly3 - sbcs xzr,$ap,xzr // did subtraction borrow? + sbcs xzr,$apx,xzr // did subtraction borrow? csel $acc0,$acc0,$t0,lo // ret = borrow ? ret : ret-modulus csel $acc1,$acc1,$t1,lo @@ -631,13 +632,13 @@ sbcs $acc1,$acc1,$t1 sbcs $acc2,$acc2,$t2 sbcs $acc3,$acc3,$t3 - sbc $ap,xzr,xzr // zap $ap + sbc $apx,xzr,xzr // zap $ap subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus adcs $t1,$acc1,$poly1 adcs $t2,$acc2,xzr adc $t3,$acc3,$poly3 - cmp $ap,xzr // did subtraction borrow? + cmp $apx,xzr // did subtraction borrow? csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret csel $acc1,$acc1,$t1,eq @@ -658,13 +659,13 @@ sbcs $acc1,$t1,$acc1 sbcs $acc2,$t2,$acc2 sbcs $acc3,$t3,$acc3 - sbc $ap,xzr,xzr // zap $ap + sbc $apx,xzr,xzr // zap $ap subs $t0,$acc0,#1 // adds $t0,$a0,#-1 // tmp = ret+modulus adcs $t1,$acc1,$poly1 adcs $t2,$acc2,xzr adc $t3,$acc3,$poly3 - cmp $ap,xzr // did subtraction borrow? + cmp $apx,xzr // did subtraction borrow? csel $acc0,$acc0,$t0,eq // ret = borrow ? ret+modulus : ret csel $acc1,$acc1,$t1,eq @@ -683,14 +684,14 @@ adcs $t1,$acc1,$poly1 adcs $t2,$acc2,xzr adcs $t3,$acc3,$poly3 - adc $ap,xzr,xzr // zap $ap + adc $apx,xzr,xzr // zap $ap tst $acc0,#1 // is a even? csel $acc0,$acc0,$t0,eq // ret = even ? a : a+modulus csel $acc1,$acc1,$t1,eq csel $acc2,$acc2,$t2,eq csel $acc3,$acc3,$t3,eq - csel $ap,xzr,$ap,eq + csel $apx,xzr,$apx,eq lsr $acc0,$acc0,#1 // ret >>= 1 orr $acc0,$acc0,$acc1,lsl#63 @@ -700,7 +701,7 @@ orr $acc2,$acc2,$acc3,lsl#63 lsr $acc3,$acc3,#1 stp $acc0,$acc1,[$rp] - orr $acc3,$acc3,$ap,lsl#63 + orr $acc3,$acc3,$apx,lsl#63 stp $acc2,$acc3,[$rp,#16] ret @@ -717,7 +718,7 @@ my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3)); # above map() describes stack layout with 4 temporary # 256-bit vectors on top. -my ($rp_real,$ap_real) = map("x$_",(21,22)); +my ($rp_real,$ap_real) = map("PTR($_)",(21,22)); $code.=<<___; .globl ecp_nistz256_point_double @@ -725,11 +726,11 @@ .align 5 ecp_nistz256_point_double: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - sub sp,sp,#32*4 + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#32*4 .Ldouble_shortcut: ldp $acc0,$acc1,[$ap,#32] @@ -744,10 +745,10 @@ mov $t2,$acc2 mov $t3,$acc3 ldp $a2,$a3,[$ap_real,#64+16] - add $rp,sp,#$S + add $rp,PTRN(sp),#$S bl __ecp_nistz256_add // p256_mul_by_2(S, in_y); - add $rp,sp,#$Zsqr + add $rp,PTRN(sp),#$Zsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); ldp $t0,$t1,[$ap_real] @@ -756,49 +757,49 @@ mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 - add $rp,sp,#$M + add $rp,PTRN(sp),#$M bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x); add $bp,$ap_real,#0 mov $acc0,$a0 // restore Zsqr mov $acc1,$a1 - ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + ldp $a0,$a1,[PTRN(sp),#$S] // forward load for p256_sqr_mont mov $acc2,$a2 mov $acc3,$a3 - ldp $a2,$a3,[sp,#$S+16] - add $rp,sp,#$Zsqr + ldp $a2,$a3,[PTRN(sp),#$S+16] + add $rp,PTRN(sp),#$Zsqr bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); - add $rp,sp,#$S + add $rp,PTRN(sp),#$S bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); ldr $bi,[$ap_real,#32] ldp $a0,$a1,[$ap_real,#64] ldp $a2,$a3,[$ap_real,#64+16] add $bp,$ap_real,#32 - add $rp,sp,#$tmp0 + add $rp,PTRN(sp),#$tmp0 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); mov $t0,$acc0 mov $t1,$acc1 - ldp $a0,$a1,[sp,#$S] // forward load for p256_sqr_mont + ldp $a0,$a1,[PTRN(sp),#$S] // forward load for p256_sqr_mont mov $t2,$acc2 mov $t3,$acc3 - ldp $a2,$a3,[sp,#$S+16] + ldp $a2,$a3,[PTRN(sp),#$S+16] add $rp,$rp_real,#64 bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0); - add $rp,sp,#$tmp0 + add $rp,PTRN(sp),#$tmp0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); - ldr $bi,[sp,#$Zsqr] // forward load for p256_mul_mont - ldp $a0,$a1,[sp,#$M] - ldp $a2,$a3,[sp,#$M+16] + ldr $bi,[PTRN(sp),#$Zsqr] // forward load for p256_mul_mont + ldp $a0,$a1,[PTRN(sp),#$M] + ldp $a2,$a3,[PTRN(sp),#$M+16] add $rp,$rp_real,#32 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); - add $bp,sp,#$Zsqr - add $rp,sp,#$M + add $bp,PTRN(sp),#$Zsqr + add $rp,PTRN(sp),#$M bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); mov $t0,$acc0 // duplicate M @@ -809,56 +810,56 @@ mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 - add $rp,sp,#$M + add $rp,PTRN(sp),#$M bl __ecp_nistz256_add mov $t0,$a0 // restore M mov $t1,$a1 ldr $bi,[$ap_real] // forward load for p256_mul_mont mov $t2,$a2 - ldp $a0,$a1,[sp,#$S] + ldp $a0,$a1,[PTRN(sp),#$S] mov $t3,$a3 - ldp $a2,$a3,[sp,#$S+16] + ldp $a2,$a3,[PTRN(sp),#$S+16] bl __ecp_nistz256_add // p256_mul_by_3(M, M); add $bp,$ap_real,#0 - add $rp,sp,#$S + add $rp,PTRN(sp),#$S bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); mov $t0,$acc0 mov $t1,$acc1 - ldp $a0,$a1,[sp,#$M] // forward load for p256_sqr_mont + ldp $a0,$a1,[PTRN(sp),#$M] // forward load for p256_sqr_mont mov $t2,$acc2 mov $t3,$acc3 - ldp $a2,$a3,[sp,#$M+16] - add $rp,sp,#$tmp0 + ldp $a2,$a3,[PTRN(sp),#$M+16] + add $rp,PTRN(sp),#$tmp0 bl __ecp_nistz256_add // p256_mul_by_2(tmp0, S); add $rp,$rp_real,#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); - add $bp,sp,#$tmp0 + add $bp,PTRN(sp),#$tmp0 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); - add $bp,sp,#$S - add $rp,sp,#$S + add $bp,PTRN(sp),#$S + add $rp,PTRN(sp),#$S bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); - ldr $bi,[sp,#$M] + ldr $bi,[PTRN(sp),#$M] mov $a0,$acc0 // copy S mov $a1,$acc1 mov $a2,$acc2 mov $a3,$acc3 - add $bp,sp,#$M + add $bp,PTRN(sp),#$M bl __ecp_nistz256_mul_mont // 
p256_mul_mont(S, S, M); add $bp,$rp_real,#32 add $rp,$rp_real,#32 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x29,x30,[sp],#96 + add PTRN(sp),PTR(29),#0 // destroy frame + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_double,.-ecp_nistz256_point_double @@ -875,7 +876,8 @@ my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); # above map() describes stack layout with 12 temporary # 256-bit vectors on top. -my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(21..28)); +my ($rp_real,$ap_real,$bp_real)=map("PTR($_)",(21..23)); +my ($in1infty,$in2infty,$temp0,$temp1,$temp2)=map("x$_",(24..28)); $code.=<<___; .globl ecp_nistz256_point_add @@ -883,14 +885,14 @@ .align 5 ecp_nistz256_point_add: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#32*12 + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#32*12 ldp $a0,$a1,[$bp,#64] // in2_z ldp $a2,$a3,[$bp,#64+16] @@ -904,7 +906,7 @@ orr $in2infty,$t0,$t2 cmp $in2infty,#0 csetm $in2infty,ne // ~in2infty - add $rp,sp,#$Z2sqr + add $rp,PTRN(sp),#$Z2sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); ldp $a0,$a1,[$ap_real,#64] // in1_z @@ -914,63 +916,63 @@ orr $in1infty,$t0,$t2 cmp $in1infty,#0 csetm $in1infty,ne // ~in1infty - add $rp,sp,#$Z1sqr + add $rp,PTRN(sp),#$Z1sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); ldr $bi,[$bp_real,#64] - ldp $a0,$a1,[sp,#$Z2sqr] - ldp $a2,$a3,[sp,#$Z2sqr+16] + ldp $a0,$a1,[PTRN(sp),#$Z2sqr] + ldp $a2,$a3,[PTRN(sp),#$Z2sqr+16] add $bp,$bp_real,#64 - add $rp,sp,#$S1 + add $rp,PTRN(sp),#$S1 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); ldr $bi,[$ap_real,#64] - ldp $a0,$a1,[sp,#$Z1sqr] - ldp $a2,$a3,[sp,#$Z1sqr+16] + ldp $a0,$a1,[PTRN(sp),#$Z1sqr] + ldp $a2,$a3,[PTRN(sp),#$Z1sqr+16] add $bp,$ap_real,#64 - add $rp,sp,#$S2 + add $rp,PTRN(sp),#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr $bi,[$ap_real,#32] - ldp $a0,$a1,[sp,#$S1] - ldp $a2,$a3,[sp,#$S1+16] + ldp $a0,$a1,[PTRN(sp),#$S1] + ldp $a2,$a3,[PTRN(sp),#$S1+16] add $bp,$ap_real,#32 - add $rp,sp,#$S1 + add $rp,PTRN(sp),#$S1 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); ldr $bi,[$bp_real,#32] - ldp $a0,$a1,[sp,#$S2] - ldp $a2,$a3,[sp,#$S2+16] + ldp $a0,$a1,[PTRN(sp),#$S2] + ldp $a2,$a3,[PTRN(sp),#$S2+16] add $bp,$bp_real,#32 - add $rp,sp,#$S2 + add $rp,PTRN(sp),#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); - add $bp,sp,#$S1 - ldr $bi,[sp,#$Z2sqr] // forward load for p256_mul_mont + add $bp,PTRN(sp),#$S1 + ldr $bi,[PTRN(sp),#$Z2sqr] // forward load for p256_mul_mont ldp $a0,$a1,[$ap_real] ldp $a2,$a3,[$ap_real,#16] - add $rp,sp,#$R + add $rp,PTRN(sp),#$R bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); orr $acc0,$acc0,$acc1 // see if result is zero orr $acc2,$acc2,$acc3 orr $temp0,$acc0,$acc2 // ~is_equal(S1,S2) - add $bp,sp,#$Z2sqr - add $rp,sp,#$U1 + add 
$bp,PTRN(sp),#$Z2sqr + add $rp,PTRN(sp),#$U1 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); - ldr $bi,[sp,#$Z1sqr] + ldr $bi,[PTRN(sp),#$Z1sqr] ldp $a0,$a1,[$bp_real] ldp $a2,$a3,[$bp_real,#16] - add $bp,sp,#$Z1sqr - add $rp,sp,#$U2 + add $bp,PTRN(sp),#$Z1sqr + add $rp,PTRN(sp),#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); - add $bp,sp,#$U1 - ldp $a0,$a1,[sp,#$R] // forward load for p256_sqr_mont - ldp $a2,$a3,[sp,#$R+16] - add $rp,sp,#$H + add $bp,PTRN(sp),#$U1 + ldp $a0,$a1,[PTRN(sp),#$R] // forward load for p256_sqr_mont + ldp $a2,$a3,[PTRN(sp),#$R+16] + add $rp,PTRN(sp),#$H bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); orr $acc0,$acc0,$acc1 // see if result is zero @@ -987,87 +989,87 @@ .Ladd_double: mov $ap,$ap_real mov $rp,$rp_real - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - add sp,sp,#32*(12-4) // difference in stack frames + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#32*(12-4) // difference in stack frames b .Ldouble_shortcut .align 4 .Ladd_proceed: - add $rp,sp,#$Rsqr + add $rp,PTRN(sp),#$Rsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); ldr $bi,[$ap_real,#64] - ldp $a0,$a1,[sp,#$H] - ldp $a2,$a3,[sp,#$H+16] + ldp $a0,$a1,[PTRN(sp),#$H] + ldp $a2,$a3,[PTRN(sp),#$H+16] add $bp,$ap_real,#64 - add $rp,sp,#$res_z + add $rp,PTRN(sp),#$res_z bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); - ldp $a0,$a1,[sp,#$H] - ldp $a2,$a3,[sp,#$H+16] - add $rp,sp,#$Hsqr + ldp $a0,$a1,[PTRN(sp),#$H] + ldp $a2,$a3,[PTRN(sp),#$H+16] + add $rp,PTRN(sp),#$Hsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); ldr $bi,[$bp_real,#64] - ldp $a0,$a1,[sp,#$res_z] - ldp $a2,$a3,[sp,#$res_z+16] + ldp $a0,$a1,[PTRN(sp),#$res_z] + ldp $a2,$a3,[PTRN(sp),#$res_z+16] add $bp,$bp_real,#64 - add $rp,sp,#$res_z + add $rp,PTRN(sp),#$res_z bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); - ldr $bi,[sp,#$H] - ldp $a0,$a1,[sp,#$Hsqr] - ldp $a2,$a3,[sp,#$Hsqr+16] - add $bp,sp,#$H - add $rp,sp,#$Hcub + ldr $bi,[PTRN(sp),#$H] + ldp $a0,$a1,[PTRN(sp),#$Hsqr] + ldp $a2,$a3,[PTRN(sp),#$Hsqr+16] + add $bp,PTRN(sp),#$H + add $rp,PTRN(sp),#$Hcub bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); - ldr $bi,[sp,#$Hsqr] - ldp $a0,$a1,[sp,#$U1] - ldp $a2,$a3,[sp,#$U1+16] - add $bp,sp,#$Hsqr - add $rp,sp,#$U2 + ldr $bi,[PTRN(sp),#$Hsqr] + ldp $a0,$a1,[PTRN(sp),#$U1] + ldp $a2,$a3,[PTRN(sp),#$U1+16] + add $bp,PTRN(sp),#$Hsqr + add $rp,PTRN(sp),#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); mov $t0,$acc0 mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 - add $rp,sp,#$Hsqr + add $rp,PTRN(sp),#$Hsqr bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); - add $bp,sp,#$Rsqr - add $rp,sp,#$res_x + add $bp,PTRN(sp),#$Rsqr + add $rp,PTRN(sp),#$res_x bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); - add $bp,sp,#$Hcub + add $bp,PTRN(sp),#$Hcub bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); - add $bp,sp,#$U2 - ldr $bi,[sp,#$Hcub] // forward load for p256_mul_mont - ldp $a0,$a1,[sp,#$S1] - ldp $a2,$a3,[sp,#$S1+16] - add $rp,sp,#$res_y + add $bp,PTRN(sp),#$U2 + ldr $bi,[PTRN(sp),#$Hcub] // forward load for p256_mul_mont + ldp $a0,$a1,[PTRN(sp),#$S1] + ldp $a2,$a3,[PTRN(sp),#$S1+16] + add $rp,PTRN(sp),#$res_y bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); - add $bp,sp,#$Hcub - add $rp,sp,#$S2 + add $bp,PTRN(sp),#$Hcub + add $rp,PTRN(sp),#$S2 bl __ecp_nistz256_mul_mont 
// p256_mul_mont(S2, S1, Hcub); - ldr $bi,[sp,#$R] - ldp $a0,$a1,[sp,#$res_y] - ldp $a2,$a3,[sp,#$res_y+16] - add $bp,sp,#$R - add $rp,sp,#$res_y + ldr $bi,[PTRN(sp),#$R] + ldp $a0,$a1,[PTRN(sp),#$res_y] + ldp $a2,$a3,[PTRN(sp),#$res_y+16] + add $bp,PTRN(sp),#$R + add $rp,PTRN(sp),#$res_y bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); - add $bp,sp,#$S2 + add $bp,PTRN(sp),#$S2 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); - ldp $a0,$a1,[sp,#$res_x] // res - ldp $a2,$a3,[sp,#$res_x+16] + ldp $a0,$a1,[PTRN(sp),#$res_x] // res + ldp $a2,$a3,[PTRN(sp),#$res_x+16] ldp $t0,$t1,[$bp_real] // in2 ldp $t2,$t3,[$bp_real,#16] ___ @@ -1078,11 +1080,11 @@ ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne - ldp $a0,$a1,[sp,#$res_x+$i+32] // res + ldp $a0,$a1,[PTRN(sp),#$res_x+$i+32] // res csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? - ldp $a2,$a3,[sp,#$res_x+$i+48] + ldp $a2,$a3,[PTRN(sp),#$res_x+$i+48] csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne ldp $t0,$t1,[$bp_real,#$i+32] // in2 @@ -1110,13 +1112,13 @@ stp $acc2,$acc3,[$rp_real,#$i+16] .Ladd_done: - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + add PTRN(sp),PTR(29),#0 // destroy frame + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add,.-ecp_nistz256_point_add @@ -1132,7 +1134,8 @@ my $Z1sqr = $S2; # above map() describes stack layout with 10 temporary # 256-bit vectors on top. -my ($rp_real,$ap_real,$bp_real,$in1infty,$in2infty,$temp)=map("x$_",(21..26)); +my ($rp_real,$ap_real,$bp_real)=map("PTR($_)",(21..23)); +my ($in1infty,$in2infty,$temp)=map("x$_",(24..26)); $code.=<<___; .globl ecp_nistz256_point_add_affine @@ -1140,13 +1143,13 @@ .align 5 ecp_nistz256_point_add_affine: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - sub sp,sp,#32*10 + stp PTR(29),PTR(30),[PTRN(sp),#-(10*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#32*10 mov $rp_real,$rp mov $ap_real,$ap @@ -1176,7 +1179,7 @@ cmp $in2infty,#0 csetm $in2infty,ne // ~in2infty - add $rp,sp,#$Z1sqr + add $rp,PTRN(sp),#$Z1sqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); mov $a0,$acc0 @@ -1185,99 +1188,99 @@ mov $a3,$acc3 ldr $bi,[$bp_real] add $bp,$bp_real,#0 - add $rp,sp,#$U2 + add $rp,PTRN(sp),#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); add $bp,$ap_real,#0 ldr $bi,[$ap_real,#64] // forward load for p256_mul_mont - ldp $a0,$a1,[sp,#$Z1sqr] - ldp $a2,$a3,[sp,#$Z1sqr+16] - add $rp,sp,#$H + ldp $a0,$a1,[PTRN(sp),#$Z1sqr] + ldp $a2,$a3,[PTRN(sp),#$Z1sqr+16] + add $rp,PTRN(sp),#$H bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); add $bp,$ap_real,#64 - add $rp,sp,#$S2 + add $rp,PTRN(sp),#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); ldr $bi,[$ap_real,#64] - ldp $a0,$a1,[sp,#$H] - ldp $a2,$a3,[sp,#$H+16] + ldp $a0,$a1,[PTRN(sp),#$H] + ldp $a2,$a3,[PTRN(sp),#$H+16] add $bp,$ap_real,#64 - add $rp,sp,#$res_z + add $rp,PTRN(sp),#$res_z bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); ldr $bi,[$bp_real,#32] - ldp $a0,$a1,[sp,#$S2] - ldp $a2,$a3,[sp,#$S2+16] + ldp $a0,$a1,[PTRN(sp),#$S2] + ldp $a2,$a3,[PTRN(sp),#$S2+16] add $bp,$bp_real,#32 - add $rp,sp,#$S2 + add $rp,PTRN(sp),#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); add $bp,$ap_real,#32 - ldp $a0,$a1,[sp,#$H] // forward load for p256_sqr_mont - ldp $a2,$a3,[sp,#$H+16] - add $rp,sp,#$R + ldp $a0,$a1,[PTRN(sp),#$H] // forward load for p256_sqr_mont + ldp $a2,$a3,[PTRN(sp),#$H+16] + add $rp,PTRN(sp),#$R bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); - add $rp,sp,#$Hsqr + add $rp,PTRN(sp),#$Hsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); - ldp $a0,$a1,[sp,#$R] - ldp $a2,$a3,[sp,#$R+16] - add $rp,sp,#$Rsqr + ldp $a0,$a1,[PTRN(sp),#$R] + ldp $a2,$a3,[PTRN(sp),#$R+16] + add $rp,PTRN(sp),#$Rsqr bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); - ldr $bi,[sp,#$H] - ldp $a0,$a1,[sp,#$Hsqr] - ldp $a2,$a3,[sp,#$Hsqr+16] - add $bp,sp,#$H - add $rp,sp,#$Hcub + ldr $bi,[PTRN(sp),#$H] + ldp $a0,$a1,[PTRN(sp),#$Hsqr] + ldp $a2,$a3,[PTRN(sp),#$Hsqr+16] + add $bp,PTRN(sp),#$H + add $rp,PTRN(sp),#$Hcub bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); ldr $bi,[$ap_real] - ldp $a0,$a1,[sp,#$Hsqr] - ldp $a2,$a3,[sp,#$Hsqr+16] + ldp $a0,$a1,[PTRN(sp),#$Hsqr] + ldp $a2,$a3,[PTRN(sp),#$Hsqr+16] add $bp,$ap_real,#0 - add $rp,sp,#$U2 + add $rp,PTRN(sp),#$U2 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); mov $t0,$acc0 mov $t1,$acc1 mov $t2,$acc2 mov $t3,$acc3 - add $rp,sp,#$Hsqr + add $rp,PTRN(sp),#$Hsqr bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); - add $bp,sp,#$Rsqr - add $rp,sp,#$res_x + add $bp,PTRN(sp),#$Rsqr + add $rp,PTRN(sp),#$res_x bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); - add $bp,sp,#$Hcub + add $bp,PTRN(sp),#$Hcub bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); - add $bp,sp,#$U2 + add $bp,PTRN(sp),#$U2 ldr $bi,[$ap_real,#32] // forward load for p256_mul_mont - ldp $a0,$a1,[sp,#$Hcub] - ldp $a2,$a3,[sp,#$Hcub+16] - add $rp,sp,#$res_y + ldp $a0,$a1,[PTRN(sp),#$Hcub] + ldp $a2,$a3,[PTRN(sp),#$Hcub+16] + add $rp,PTRN(sp),#$res_y bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); add $bp,$ap_real,#32 - add 
$rp,sp,#$S2 + add $rp,PTRN(sp),#$S2 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); - ldr $bi,[sp,#$R] - ldp $a0,$a1,[sp,#$res_y] - ldp $a2,$a3,[sp,#$res_y+16] - add $bp,sp,#$R - add $rp,sp,#$res_y + ldr $bi,[PTRN(sp),#$R] + ldp $a0,$a1,[PTRN(sp),#$res_y] + ldp $a2,$a3,[PTRN(sp),#$res_y+16] + add $bp,PTRN(sp),#$R + add $rp,PTRN(sp),#$res_y bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); - add $bp,sp,#$S2 + add $bp,PTRN(sp),#$S2 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); - ldp $a0,$a1,[sp,#$res_x] // res - ldp $a2,$a3,[sp,#$res_x+16] + ldp $a0,$a1,[PTRN(sp),#$res_x] // res + ldp $a2,$a3,[PTRN(sp),#$res_x+16] ldp $t0,$t1,[$bp_real] // in2 ldp $t2,$t3,[$bp_real,#16] ___ @@ -1288,11 +1291,11 @@ ldp $acc2,$acc3,[$ap_real,#$i+16] csel $t0,$a0,$t0,ne csel $t1,$a1,$t1,ne - ldp $a0,$a1,[sp,#$res_x+$i+32] // res + ldp $a0,$a1,[PTRN(sp),#$res_x+$i+32] // res csel $t2,$a2,$t2,ne csel $t3,$a3,$t3,ne cmp $in2infty,#0 // ~$in2intfy, remember? - ldp $a2,$a3,[sp,#$res_x+$i+48] + ldp $a2,$a3,[PTRN(sp),#$res_x+$i+48] csel $acc0,$t0,$acc0,ne csel $acc1,$t1,$acc1,ne ldp $t0,$t1,[$bp_real,#$i+32] // in2 @@ -1322,12 +1325,12 @@ stp $acc0,$acc1,[$rp_real,#$i] stp $acc2,$acc3,[$rp_real,#$i+16] - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x29,x30,[sp],#80 + add PTRN(sp),PTR(29),#0 // destroy frame + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(10*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine @@ -1336,6 +1339,7 @@ if (1) { my ($ord0,$ord1) = ($poly1,$poly3); my ($ord2,$ord3,$ordk,$t4) = map("x$_",(21..24)); +my $ordp = "PTR(23)"; my $acc7 = $bi; $code.=<<___; @@ -1348,20 +1352,20 @@ ecp_nistz256_ord_mul_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] + stp PTR(29),PTR(30),[PTRN(sp),#-(8*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] - adr $ordk,.Lord + adr $ordp,.Lord ldr $bi,[$bp] // bp[0] ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] - ldp $ord0,$ord1,[$ordk,#0] - ldp $ord2,$ord3,[$ordk,#16] - ldr $ordk,[$ordk,#32] + ldp $ord0,$ord1,[$ordp,#0] + ldp $ord2,$ord3,[$ordp,#16] + ldr $ordk,[$ordp,#32] mul $acc0,$a0,$bi // a[0]*b[0] umulh $t0,$a0,$bi @@ -1475,10 +1479,10 @@ csel $acc3,$acc3,$t3,lo stp $acc2,$acc3,[$rp,#16] - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldr x29,[sp],#64 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(8*PTR_WIDTH) ret .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont @@ -1491,19 +1495,19 @@ ecp_nistz256_ord_sqr_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] + stp PTR(29),PTR(30),[PTRN(sp),#-(8*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] - adr $ordk,.Lord + adr $ordp,.Lord ldp $a0,$a1,[$ap] ldp $a2,$a3,[$ap,#16] - ldp $ord0,$ord1,[$ordk,#0] - ldp $ord2,$ord3,[$ordk,#16] - ldr $ordk,[$ordk,#32] + ldp $ord0,$ord1,[$ordp,#0] + ldp $ord2,$ord3,[$ordp,#16] + ldr $ordk,[$ordp,#32] b .Loop_ord_sqr .align 4 @@ -1620,15 +1624,15 @@ csel $a2,$acc2,$t2,lo csel $a3,$acc3,$t3,lo - cbnz $bp,.Loop_ord_sqr + cbnz $bpx,.Loop_ord_sqr stp $a0,$a1,[$rp] stp $a2,$a3,[$rp,#16] - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldr x29,[sp],#64 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(8*PTR_WIDTH) ret .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont ___ @@ -1637,7 +1641,7 @@ ######################################################################## # scatter-gather subroutines { -my ($out,$inp,$index,$mask)=map("x$_",(0..3)); +my ($out,$inp,$index,$mask)=("PTR(0)","PTR(1)","x2","x3"); $code.=<<___; // void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1, // int x2); @@ -1647,8 +1651,8 @@ ecp_nistz256_scatter_w5: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 add $out,$out,$index,lsl#2 @@ -1699,7 +1703,7 @@ str w6,[$out,#64*6-4] str w7,[$out,#64*7-4] - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 @@ -1711,8 +1715,8 @@ ecp_nistz256_gather_w5: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 cmp $index,xzr csetm x3,ne @@ -1778,7 +1782,7 @@ stp x4,x5,[$out,#64] // Z stp x6,x7,[$out,#80] - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 @@ -1790,8 +1794,8 @@ ecp_nistz256_scatter_w7: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 add $out,$out,$index mov $index,#64/8 @@ -1824,7 +1828,7 @@ add $out,$out,#64*8 b.ne .Loop_scatter_w7 - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 @@ -1836,8 +1840,8 @@ ecp_nistz256_gather_w7: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 cmp $index,xzr csetm x3,ne @@ -1875,7 +1879,7 @@ str x4,[$out],#8 b.ne .Loop_gather_w7 - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 ___ From 926061686cd82efd45ef89ca50f88ba9b5ca0ebe Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 15 May 2024 13:53:21 -0700 Subject: [PATCH 09/19] OpenSSL ghashv8-armx.pl: Add purecap support --- .../openssl/crypto/modes/asm/ghashv8-armx.pl | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl b/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl index cb7720ae9cfb..31a9886c3c84 100644 --- a/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl +++ b/crypto/openssl/crypto/modes/asm/ghashv8-armx.pl @@ -60,9 +60,9 @@ or die "can't call $xlate: $!"; *STDOUT=*OUT; -$Xi="x0"; # argument block -$Htbl="x1"; -$inp="x2"; +$Xi="PTR(0)"; # argument block +$Htbl="PTR(1)"; +$inp="PTR(2)"; $len="x3"; $inc="x12"; @@ -77,7 +77,12 @@ #if __ARM_MAX_ARCH__>=7 ___ -$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); +$code.=<<___ if ($flavour =~ /64/); +#ifndef __CHERI_PURE_CAPABILITY__ +.arch armv8-a+crypto +#endif +.text +___ $code.=<<___ if ($flavour !~ /64/); .fpu neon #ifdef __thumb2__ @@ -112,7 +117,7 @@ AARCH64_VALID_CALL_TARGET ___ $code.=<<___; - vld1.64 {$t1},[x1] @ load input H + vld1.64 {$t1},[PTR(1)] @ load input H vmov.i8 $xC2,#0xe1 vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 vext.8 $IN,$t1,$t1,#8 @@ -127,7 +132,7 @@ vand $t0,$t0,$t1 vorr $IN,$IN,$t2 @ H<<<=1 veor $H,$IN,$t0 @ twisted H - vst1.64 {$H},[x0],#16 @ store Htable[0] + vst1.64 {$H},[PTR(0)],#16 @ store Htable[0] @ calculate H^2 vext.8 $t0,$H,$H,#8 @ Karatsuba pre-processing @@ -154,7 +159,7 @@ vext.8 $t1,$H2,$H2,#8 @ Karatsuba pre-processing veor $t1,$t1,$H2 vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed - vst1.64 {$Hhl-$H2},[x0],#32 @ store Htable[1..2] + vst1.64 {$Hhl-$H2},[PTR(0)],#32 @ store Htable[1..2] ___ if ($flavour =~ /64/) { my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7)); @@ -200,7 +205,7 @@ veor $t0,$t0,$H veor $t1,$t1,$H2 vext.8 $Hhl,$t0,$t1,#8 @ pack Karatsuba pre-processed - vst1.64 {$H-$H2},[x0] @ store Htable[3..5] + vst1.64 {$H-$H2},[PTR(0)] @ store Htable[3..5] ___ } $code.=<<___; From f5e7d4b63c47833b6b10acc4f947c2c04a7bd9b1 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 15 May 2024 15:00:04 -0700 Subject: [PATCH 10/19] OpenSSL aes-gcm-armv8_64.pl: Add purecap support NB: This just fixes pointers operands to be capability registers, it does not yet address out of bounds stores at the end of a buffer when the length is not a multiple of the block size. 
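
To make the mechanical part of the conversion easier to review, the fragment
below is a rough, hypothetical illustration of the substitution pattern used
throughout this series: pointer-holding registers go through the PTR()/PTRN()
macros and their save slots are scaled by PTR_WIDTH, so the same source
assembles to ordinary 64-bit registers on plain AArch64 and to capability
registers in a purecap build. example_fn is a placeholder name and is not part
of any patched file.

    #include "arm_arch.h"

    .text
    .globl  example_fn
    .type   example_fn,%function
    .align  4
    example_fn:
            // Save frame pointer and link register; both the registers and
            // the slot size are selected by the PTR()/PTRN()/PTR_WIDTH macros
            // depending on whether __CHERI_PURE_CAPABILITY__ is defined.
            stp     PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]!
            add     PTR(29),PTRN(sp),#0
            // ... function body elided ...
            ldp     PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH)
            ret
    .size   example_fn,.-example_fn

The neg/add pairs that appear under #ifdef __CHERI_PURE_CAPABILITY__ in the
diffs follow the same idea for address arithmetic: they appear to stand in for
subtracting a 64-bit register from a base, which is not directly available
when the base register is a capability.
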
--- .../crypto/modes/asm/aes-gcm-armv8_64.pl | 331 ++++++++++-------- 1 file changed, 185 insertions(+), 146 deletions(-) diff --git a/crypto/openssl/crypto/modes/asm/aes-gcm-armv8_64.pl b/crypto/openssl/crypto/modes/asm/aes-gcm-armv8_64.pl index ac061f797d32..3041921b8f7b 100755 --- a/crypto/openssl/crypto/modes/asm/aes-gcm-armv8_64.pl +++ b/crypto/openssl/crypto/modes/asm/aes-gcm-armv8_64.pl @@ -158,15 +158,17 @@ open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; -$input_ptr="x0"; #argument block +$input_ptr="PTR(0)"; #argument block +$input_ptrx="x0"; $bit_length="x1"; -$output_ptr="x2"; -$current_tag="x3"; -$counter="x16"; -$cc="x8"; +$output_ptr="PTR(2)"; +$current_tag="PTR(3)"; +$counter="PTR(16)"; +$cc="PTR(8)"; { -my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my ($end_input_ptrx,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my ($end_input_ptr)=map("PTR($_)",(4)); my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); my ($output_l0,$output_h0)=map("x$_",(6..7)); @@ -229,7 +231,12 @@ #if __ARM_MAX_ARCH__>=8 ___ -$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); +$code.=<<___ if ($flavour =~ /64/); +#ifndef __CHERI_PURE_CAPABILITY__ +.arch armv8-a+crypto +#endif +.text +___ $code.=<<___ if ($flavour !~ /64/); .fpu neon #ifdef __thumb2__ @@ -259,15 +266,15 @@ aes_gcm_enc_128_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L128_enc_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov $counter, PTR(4) + mov $cc, PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ @@ -286,7 +293,12 @@ mov $len, $main_end_input_ptr ld1 {$rk0s}, [$cc], #16 @ load rk0 +#ifdef __CHERI_PURE_CAPABILITY__ + lsr $end_input_ptrx, $bit_length, #3 + add $end_input_ptr, $input_ptr, $end_input_ptrx @ end_input_ptr +#else add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr +#endif sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 lsr $rctr32x, $ctr96_t32x, #32 @@ -382,10 +394,10 @@ trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 - add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + add $main_end_input_ptr, $main_end_input_ptr, $input_ptrx aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 @@ -505,7 +517,7 @@ st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result fmov $ctr2d, $ctr96_b64x @ CTR block 6 - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 8 blocks fmov $ctr2.d[1], $ctr32x @ CTR block 6 rev $ctr32w, $rctr32w @ CTR block 7 @@ -676,7 +688,7 @@ fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 - cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + cmp $input_ptrx, $main_end_input_ptr @ LOOP CONTROL aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid @@ -891,7 +903,7 @@ aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 .L128_enc_tail: @ TAIL - sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + sub $main_end_input_ptr, $end_input_ptrx, $input_ptrx @ main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 @@ -1115,13 +1127,13 @@ rev64 $acc_lb, $acc_lb mov x0, $len st1 { $acc_l.16b }, [$current_tag] - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L128_enc_ret: @@ -1145,15 +1157,15 @@ aes_gcm_dec_128_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L128_dec_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov $counter, PTR(4) + mov $cc, PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr @@ -1209,7 +1221,12 @@ add $rctr32w, $rctr32w, #1 @ CTR block 3 fmov $ctr3.d[1], $ctr32x @ CTR block 3 +#ifdef __CHERI_PURE_CAPABILITY__ + lsr $end_input_ptrx, $bit_length, #3 + add $end_input_ptr, $input_ptr, $end_input_ptrx @ end_input_ptr +#else add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr +#endif aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 ld1 {$rk3s}, [$cc], #16 @ load rk3 @@ -1285,7 +1302,7 @@ ext $h4b, $h4b, $h4b, #8 #endif trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l - add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + add $main_end_input_ptr, $main_end_input_ptr, $input_ptrx aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 @@ -1311,7 +1328,7 @@ aese $ctr3b, $rk9 @ AES block 3 - round 9 aese $ctr0b, $rk9 @ AES block 0 - round 9 - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr1b, $rk9 @ AES block 1 - round 9 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k @@ -1336,7 +1353,7 @@ mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 8 blocks mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high @@ -1578,7 +1595,7 @@ aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 - cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + cmp $input_ptrx, $main_end_input_ptr @ LOOP CONTROL rev64 $res0b, $res0b @ GHASH block 4k+4 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low @@ -1779,7 +1796,7 @@ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low .L128_dec_tail: @ TAIL - sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + sub $main_end_input_ptr, $end_input_ptrx, $input_ptrx @ main_end_input_ptr is number of bytes left to process ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result @@ -1953,7 +1970,7 @@ eor $res0b, $res0b, $t0.16b @ feed in partial tag - ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite + ldp $end_input_ptrx, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite and $output_h0, $output_h0, $ctr96_b64x @@ -1966,7 +1983,7 @@ pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low - bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes + bic $end_input_ptrx, $end_input_ptrx, $ctr32x @ mask out low existing bytes and $output_l0, $output_l0, $ctr32x #ifndef __AARCH64EB__ @@ -1989,7 +2006,7 @@ eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up - orr $output_l0, $output_l0, $end_input_ptr + orr $output_l0, $output_l0, $end_input_ptrx str $ctr32w, [$counter, #12] @ store the updated counter orr $output_h0, $output_h0, $main_end_input_ptr @@ -2011,13 +2028,13 @@ mov x0, $len st1 { $acc_l.16b }, [$current_tag] - ldp x21, x22, 
[sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L128_dec_ret: @@ -2028,7 +2045,8 @@ } { -my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my ($end_input_ptrx,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my $end_input_ptr="PTR(4)"; my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); my ($output_l0,$output_h0)=map("x$_",(6..7)); @@ -2103,15 +2121,15 @@ aes_gcm_enc_192_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L192_enc_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! + mov $counter, PTR(4) + mov $cc, PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ @@ -2285,11 +2303,16 @@ eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k aese $ctr2b, $rk11 @ AES block 2 - round 11 +#ifdef __CHERI_PURE_CAPABILITY__ + lsr $end_input_ptrx, $bit_length, #3 + add $end_input_ptr, $input_ptr, $end_input_ptrx @ end_input_ptr +#else add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr - add $main_end_input_ptr, $main_end_input_ptr, $input_ptr +#endif + add $main_end_input_ptr, $main_end_input_ptr, $input_ptrx aese $ctr1b, $rk11 @ AES block 1 - round 11 - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr0b, $rk11 @ AES block 0 - round 11 add $rctr32w, $rctr32w, #1 @ CTR block 3 @@ -2320,7 +2343,7 @@ rev $input_h1, $input_h1 #endif add $input_ptr, $input_ptr, #64 @ AES input_ptr update - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 8 blocks eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low @@ -2527,7 +2550,7 @@ eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid - cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + cmp $input_ptrx, $main_end_input_ptr @ LOOP CONTROL fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 @@ -2786,7 +2809,7 @@ eor $acc_lb, $acc_lb, $acc_mb .L192_enc_tail: @ TAIL - sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + sub $main_end_input_ptr, $end_input_ptrx, $input_ptrx @ main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 @@ 
-3010,13 +3033,13 @@ mov x0, $len st1 { $acc_l.16b }, [$current_tag] - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L192_enc_ret: @@ -3040,17 +3063,22 @@ aes_gcm_dec_192_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L192_dec_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] - + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! + mov $counter, PTR(4) + mov $cc, PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + +#ifdef __CHERI_PURE_CAPABILITY__ + lsr $end_input_ptrx, $bit_length, #3 + add $end_input_ptr, $input_ptr, $end_input_ptrx @ end_input_ptr +#else add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr +#endif ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev $ctr96_b64x, $ctr96_b64x @@ -3204,10 +3232,10 @@ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 - add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + add $main_end_input_ptr, $main_end_input_ptr, $input_ptrx aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h @@ -3250,7 +3278,7 @@ fmov $ctr0d, $ctr96_b64x @ CTR block 4 rev64 $res1b, $res1b @ GHASH block 1 - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 8 blocks eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low #ifdef __AARCH64EB__ @@ -3459,7 +3487,7 @@ aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid - cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + cmp $input_ptrx, $main_end_input_ptr @ LOOP CONTROL eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high @@ -3724,7 +3752,7 @@ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low .L192_dec_tail: @ TAIL - sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + sub $main_end_input_ptr, $end_input_ptrx, $input_ptrx @ main_end_input_ptr is number of bytes left to process ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result @@ -3875,7 +3903,7 @@ .L192_dec_blocks_less_than_1: @ blocks left <= 1 mvn $rk12_l, xzr @ rk12_l = 
0xffffffffffffffff - ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite + ldp $end_input_ptrx, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite and $bit_length, $bit_length, #127 @ bit_length %= 128 sub $bit_length, $bit_length, #128 @ bit_length -= 128 @@ -3893,9 +3921,9 @@ fmov $ctr0d, $ctr32x @ ctr0b is mask for last block and $output_l0, $output_l0, $ctr32x - bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes + bic $end_input_ptrx, $end_input_ptrx, $ctr32x @ mask out low existing bytes - orr $output_l0, $output_l0, $end_input_ptr + orr $output_l0, $output_l0, $end_input_ptrx mov $ctr0.d[1], $ctr96_b64x #ifndef __AARCH64EB__ rev $ctr32w, $rctr32w @@ -3957,13 +3985,13 @@ mov x0, $len st1 { $acc_l.16b }, [$current_tag] - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L192_dec_ret: @@ -3974,7 +4002,8 @@ } { -my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my ($end_input_ptrx,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); +my $end_input_ptr="PTR(4)"; my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); my ($output_l0,$output_h0)=map("x$_",(6..7)); @@ -4048,17 +4077,22 @@ aes_gcm_enc_256_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L256_enc_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] - + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov $counter, PTR(4) + mov $cc, PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + +#ifdef __CHERI_PURE_CAPABILITY__ + lsr $end_input_ptrx, $bit_length, #3 + add $end_input_ptr, $input_ptr, $end_input_ptrx @ end_input_ptr +#else add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr +#endif lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 @@ -4078,14 +4112,14 @@ and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) ld1 {$rk1s}, [$cc], #16 @ load rk1 - add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + add $main_end_input_ptr, $main_end_input_ptr, $input_ptrx lsr $rctr32x, $ctr96_t32x, #32 fmov $ctr2d, $ctr96_b64x @ CTR block 2 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w rev $rctr32w, $rctr32w @ rev_ctr32 - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 4 blocks fmov $ctr1d, $ctr96_b64x @ CTR block 1 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 @@ -4287,7 +4321,7 @@ eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 8 blocks fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low @@ -4541,7 +4575,7 @@ fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low - cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + cmp $input_ptrx, $main_end_input_ptr @ LOOP CONTROL fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high @@ -4776,7 +4810,7 @@ .L256_enc_tail: @ TAIL ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag - sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + sub $main_end_input_ptr, $end_input_ptrx, $input_ptrx @ main_end_input_ptr is number of bytes left to process ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev $input_l0, $input_l0 @@ -4998,13 +5032,13 @@ mov x0, $len st1 { $acc_l.16b }, [$current_tag] - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L256_enc_ret: @@ -5033,15 +5067,15 @@ aes_gcm_dec_256_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L256_dec_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov $counter, PTR(4) + mov $cc, PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] lsr $main_end_input_ptr, $bit_length, #3 @ byte_len mov $len, $main_end_input_ptr @@ -5061,7 +5095,12 @@ ld1 {$rk1s}, [$cc], #16 @ load rk1 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) +#ifdef __CHERI_PURE_CAPABILITY__ + lsr $end_input_ptrx, $bit_length, #3 + add $end_input_ptr, $input_ptr, $end_input_ptrx @ end_input_ptr +#else add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr +#endif ld1 {$rk2s}, [$cc], #16 @ load rk2 lsr $rctr32x, $ctr96_t32x, #32 @@ -5069,7 +5108,7 @@ orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w ld1 {$rk4s}, [$cc], #16 @ load rk4 - add $main_end_input_ptr, $main_end_input_ptr, $input_ptr + add $main_end_input_ptr, $main_end_input_ptr, $input_ptrx rev $rctr32w, $rctr32w @ rev_ctr32 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 @@ -5155,7 +5194,7 @@ aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 4 blocks aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 @@ -5307,7 +5346,7 @@ stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result - cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks + cmp $input_ptrx, $main_end_input_ptr @ check if we have <= 8 blocks b.ge .L256_dec_prepretail @ do prepretail .L256_dec_main_loop: @ main loop start @@ -5520,7 +5559,7 @@ aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 - cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL + cmp $input_ptrx, $main_end_input_ptr @ LOOP CONTROL add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 @@ -5773,7 +5812,7 @@ eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low .L256_dec_tail: @ TAIL - sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process + sub $main_end_input_ptr, $end_input_ptrx, $input_ptrx @ main_end_input_ptr is number of bytes left to process ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result @@ -5933,7 +5972,7 @@ sub $bit_length, $bit_length, #128 @ bit_length -= 128 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff - ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite + ldp $end_input_ptrx, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) and $bit_length, $bit_length, #127 @ bit_length %= 128 @@ -5948,7 +5987,7 @@ and $output_l0, $output_l0, $ctr32x mov $ctr0.d[1], $ctr96_b64x - bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes + bic $end_input_ptrx, $end_input_ptrx, $ctr32x @ mask out low existing bytes #ifndef __AARCH64EB__ rev $ctr32w, $rctr32w @@ -5958,7 +5997,7 @@ bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes - orr $output_l0, $output_l0, 
$end_input_ptr + orr $output_l0, $output_l0, $end_input_ptrx and $output_h0, $output_h0, $ctr96_b64x @@ -6017,13 +6056,13 @@ mov x0, $len st1 { $acc_l.16b }, [$current_tag] - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L256_dec_ret: From 471caecae30b7867471d2b8c366e6df356e57278 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 16 May 2024 12:40:22 -0700 Subject: [PATCH 11/19] OpenSSL poly1305-armv8.pl: Add purecap support --- .../crypto/poly1305/asm/poly1305-armv8.pl | 80 ++++++++++--------- 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/crypto/openssl/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/openssl/crypto/poly1305/asm/poly1305-armv8.pl index ca1be8d72d3b..688d7a86e457 100755 --- a/crypto/openssl/crypto/poly1305/asm/poly1305-armv8.pl +++ b/crypto/openssl/crypto/poly1305/asm/poly1305-armv8.pl @@ -49,8 +49,10 @@ or die "can't call $xlate: $!"; *STDOUT=*OUT; -my ($ctx,$inp,$len,$padbit) = map("x$_",(0..3)); -my ($mac,$nonce)=($inp,$len); +my ($ctx,$inp,$len,$padbit) = ("PTR(0)","PTR(1)","x2","x3"); +my ($mac,$nonce)=($inp,"PTR(2)"); +my ($inpx) = ("x1"); +my ($func,$r0p,$r1p,$d0p,$d1p) = map("PTR($_)",(2,7..8,12..13)); my ($h0,$h1,$h2,$r0,$r1,$s1,$t0,$t1,$d0,$d1,$d2) = map("x$_",(4..14)); @@ -73,15 +75,15 @@ .align 5 poly1305_init: AARCH64_VALID_CALL_TARGET - cmp $inp,xzr + cmp $inpx,xzr stp xzr,xzr,[$ctx] // zero hash value stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] - csel x0,xzr,x0,eq + csel $ctx,PTR(zr),$ctx,eq b.eq .Lno_key - adrp x17,OPENSSL_armcap_P - ldr w17,[x17,#:lo12:OPENSSL_armcap_P] + adrp PTR(17),OPENSSL_armcap_P + ldr w17,[PTR(17),#:lo12:OPENSSL_armcap_P] ldp $r0,$r1,[$inp] // load key mov $s1,#0xfffffffc0fffffff @@ -97,18 +99,18 @@ tst w17,#ARMV7_NEON - adr $d0,.Lpoly1305_blocks - adr $r0,.Lpoly1305_blocks_neon - adr $d1,.Lpoly1305_emit - adr $r1,.Lpoly1305_emit_neon + adr $d0p,.Lpoly1305_blocks + adr $r0p,.Lpoly1305_blocks_neon + adr $d1p,.Lpoly1305_emit + adr $r1p,.Lpoly1305_emit_neon - csel $d0,$d0,$r0,eq - csel $d1,$d1,$r1,eq + csel $d0p,$d0p,$r0p,eq + csel $d1p,$d1p,$r1p,eq #ifdef __ILP32__ - stp w12,w13,[$len] + stp w12,w13,[$func] #else - stp $d0,$d1,[$len] + stp $d0p,$d1p,[$func] #endif mov x0,#1 @@ -117,6 +119,7 @@ .size poly1305_init,.-poly1305_init .type poly1305_blocks,%function +.type .Lpoly1305_blocks,%function .align 5 poly1305_blocks: .Lpoly1305_blocks: @@ -185,6 +188,7 @@ .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,%function +.type Lpoly1305_emit,%function .align 5 poly1305_emit: .Lpoly1305_emit: @@ -226,8 +230,8 @@ my ($H0,$H1,$H2,$H3,$H4) = map("v$_.2s",(24..28)); my ($T0,$T1,$MASK) = map("v$_",(29..31)); -my ($in2,$zeros)=("x16","x17"); -my $is_base2_26 = $zeros; # borrow +my ($in2,$zeros)=("PTR(16)","PTR(17)"); +my $is_base2_26 = "x17"; # borrow $code.=<<___; .type poly1305_mult,%function @@ -295,6 +299,7 @@ .size poly1305_splat,.-poly1305_splat .type poly1305_blocks_neon,%function +.type .Lpoly1305_blocks_neon,%function .align 5 poly1305_blocks_neon: .Lpoly1305_blocks_neon: @@ -308,8 +313,8 @@ .Lblocks_neon: AARCH64_SIGN_LINK_REGISTER - 
stp x29,x30,[sp,#-80]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH+64)]! + add PTR(29),PTRN(sp),#0 ands $len,$len,#-16 b.eq .Lno_data_neon @@ -354,7 +359,7 @@ adc $h2,$h2,$padbit bl poly1305_mult - ldr x30,[sp,#8] + ldr PTR(30),[PTRN(sp),#PTR_WIDTH] cbz $padbit,.Lstore_base2_64_neon @@ -409,10 +414,10 @@ ubfx x13,$h1,#14,#26 extr x14,$h2,$h1,#40 - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] + stp d8,d9,[PTRN(sp),#2*PTR_WIDTH] // meet ABI requirements + stp d10,d11,[PTRN(sp),#2*PTR_WIDTH+16] + stp d12,d13,[PTRN(sp),#2*PTR_WIDTH+32] + stp d14,d15,[PTRN(sp),#2*PTR_WIDTH+48] fmov ${H0},x10 fmov ${H1},x11 @@ -439,7 +444,7 @@ bl poly1305_mult // r^4 sub $ctx,$ctx,#4 bl poly1305_splat - ldr x30,[sp,#8] + ldr PTR(30),[PTRN(sp),#PTR_WIDTH] add $in2,$inp,#32 adr $zeros,.Lzeros @@ -458,10 +463,10 @@ subs $len,$len,#64 csel $in2,$zeros,$in2,lo - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] + stp d8,d9,[PTRN(sp),#2*PTR_WIDTH] // meet ABI requirements + stp d10,d11,[PTRN(sp),#2*PTR_WIDTH+16] + stp d12,d13,[PTRN(sp),#2*PTR_WIDTH+32] + stp d14,d15,[PTRN(sp),#2*PTR_WIDTH+48] fmov ${H0},x10 fmov ${H1},x11 @@ -474,7 +479,7 @@ ldp x9,x13,[$in2],#48 lsl $padbit,$padbit,#24 - add x15,$ctx,#48 + add PTR(15),$ctx,#48 #ifdef __AARCH64EB__ rev x8,x8 @@ -508,9 +513,9 @@ ldp x8,x12,[$inp],#16 // inp[0:1] ldp x9,x13,[$inp],#48 - ld1 {$R0,$R1,$S1,$R2},[x15],#64 - ld1 {$S2,$R3,$S3,$R4},[x15],#64 - ld1 {$S4},[x15] + ld1 {$R0,$R1,$S1,$R2},[PTR(15)],#64 + ld1 {$S2,$R3,$S3,$R4},[PTR(15)],#64 + ld1 {$S4},[PTR(15)] #ifdef __AARCH64EB__ rev x8,x8 @@ -829,13 +834,13 @@ // horizontal add addp $ACC3,$ACC3,$ACC3 - ldp d8,d9,[sp,#16] // meet ABI requirements + ldp d8,d9,[PTRN(sp),#2*PTR_WIDTH] // meet ABI requirements addp $ACC0,$ACC0,$ACC0 - ldp d10,d11,[sp,#32] + ldp d10,d11,[PTRN(sp),#2*PTR_WIDTH+16] addp $ACC4,$ACC4,$ACC4 - ldp d12,d13,[sp,#48] + ldp d12,d13,[PTRN(sp),#2*PTR_WIDTH+32] addp $ACC1,$ACC1,$ACC1 - ldp d14,d15,[sp,#64] + ldp d14,d15,[PTRN(sp),#2*PTR_WIDTH+48] addp $ACC2,$ACC2,$ACC2 //////////////////////////////////////////////////////////////// @@ -876,12 +881,13 @@ st1 {$ACC4}[0],[$ctx] .Lno_data_neon: - ldr x29,[sp],#80 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH+64) AARCH64_VALIDATE_LINK_REGISTER ret .size poly1305_blocks_neon,.-poly1305_blocks_neon .type poly1305_emit_neon,%function +.type .Lpoly1305_emit_neon,%function .align 5 poly1305_emit_neon: .Lpoly1305_emit_neon: From d5b2df364fceca4aa0cbe7c02f8a1aaa72af13e0 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 16 May 2024 15:24:33 -0700 Subject: [PATCH 12/19] OpenSSL keccak1600-armv8.pl: Add purecap support --- .../crypto/sha/asm/keccak1600-armv8.pl | 332 +++++++++--------- 1 file changed, 168 insertions(+), 164 deletions(-) diff --git a/crypto/openssl/crypto/sha/asm/keccak1600-armv8.pl b/crypto/openssl/crypto/sha/asm/keccak1600-armv8.pl index 40f7aa7a695a..510a3f97d3cc 100755 --- a/crypto/openssl/crypto/sha/asm/keccak1600-armv8.pl +++ b/crypto/openssl/crypto/sha/asm/keccak1600-armv8.pl @@ -121,20 +121,21 @@ $A[3][3] = "x25"; # x18 is reserved my @C = map("x$_", (26,27,28,30)); +my @P = map("PTR($_)", (26,27,28,30)); $code.=<<___; .type KeccakF1600_int,%function .align 5 KeccakF1600_int: AARCH64_SIGN_LINK_REGISTER - adr $C[2],iotas - stp $C[2],x30,[sp,#16] // 32 bytes on top are mine + adr @P[2],iotas + stp @P[2],PTR(30),[PTRN(sp),#(2*PTR_WIDTH)] // 32 bytes on top are mine b .Loop .align 4 .Loop: 
////////////////////////////////////////// Theta eor $C[0],$A[0][0],$A[1][0] - stp $A[0][4],$A[1][4],[sp,#0] // offload pair... + stp $A[0][4],$A[1][4],[PTRN(sp),#0] // offload pair... eor $C[1],$A[0][1],$A[1][1] eor $C[2],$A[0][2],$A[1][2] eor $C[3],$A[0][3],$A[1][3] @@ -187,7 +188,7 @@ $C[4]=undef; $C[5]=undef; $code.=<<___; - ldp $A[0][4],$A[1][4],[sp,#0] // re-load offloaded data + ldp $A[0][4],$A[1][4],[PTRN(sp),#0] // re-load offloaded data eor $C[0], $A[0][3],$C[2] // mov $C[0],$A[0][3] eor $A[1][3],$A[1][3],$C[2] eor $A[2][3],$A[2][3],$C[2] @@ -243,15 +244,15 @@ eor $A[0][0],$A[0][0],$C[0] bic $C[0],$A[0][4],$A[0][3] eor $A[0][1],$A[0][1],$C[1] - ldr $C[1],[sp,#16] + ldr $P[1],[PTRN(sp),#16] eor $A[0][3],$A[0][3],$C[2] eor $A[0][4],$A[0][4],$C[3] eor $A[0][2],$A[0][2],$C[0] - ldr $C[3],[$C[1]],#8 // Iota[i++] + ldr $C[3],[$P[1]],#8 // Iota[i++] bic $C[0],$A[1][2],$A[1][1] tst $C[1],#255 // are we done? - str $C[1],[sp,#16] + str $C[1],[PTRN(sp),#16] bic $C[1],$A[1][3],$A[1][2] bic $C[2],$A[1][0],$A[1][4] eor $A[0][0],$A[0][0],$C[3] // A[0][0] ^= Iota @@ -298,7 +299,7 @@ bne .Loop - ldr x30,[sp,#24] + ldr PTR(30),[PTRN(sp),#(2*PTR_WIDTH+8)] AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600_int,.-KeccakF1600_int @@ -307,55 +308,55 @@ .align 5 KeccakF1600: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#48 - - str x0,[sp,#32] // offload argument - mov $C[0],x0 - ldp $A[0][0],$A[0][1],[x0,#16*0] - ldp $A[0][2],$A[0][3],[$C[0],#16*1] - ldp $A[0][4],$A[1][0],[$C[0],#16*2] - ldp $A[1][1],$A[1][2],[$C[0],#16*3] - ldp $A[1][3],$A[1][4],[$C[0],#16*4] - ldp $A[2][0],$A[2][1],[$C[0],#16*5] - ldp $A[2][2],$A[2][3],[$C[0],#16*6] - ldp $A[2][4],$A[3][0],[$C[0],#16*7] - ldp $A[3][1],$A[3][2],[$C[0],#16*8] - ldp $A[3][3],$A[3][4],[$C[0],#16*9] - ldp $A[4][0],$A[4][1],[$C[0],#16*10] - ldp $A[4][2],$A[4][3],[$C[0],#16*11] - ldr $A[4][4],[$C[0],#16*12] + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#(2*PTR_WIDTH+32) + + str PTR(0),[PTRN(sp),#32] // offload argument + mov $P[0],PTR(0) + ldp $A[0][0],$A[0][1],[PTR(0),#16*0] + ldp $A[0][2],$A[0][3],[$P[0],#16*1] + ldp $A[0][4],$A[1][0],[$P[0],#16*2] + ldp $A[1][1],$A[1][2],[$P[0],#16*3] + ldp $A[1][3],$A[1][4],[$P[0],#16*4] + ldp $A[2][0],$A[2][1],[$P[0],#16*5] + ldp $A[2][2],$A[2][3],[$P[0],#16*6] + ldp $A[2][4],$A[3][0],[$P[0],#16*7] + ldp $A[3][1],$A[3][2],[$P[0],#16*8] + ldp $A[3][3],$A[3][4],[$P[0],#16*9] + ldp $A[4][0],$A[4][1],[$P[0],#16*10] + ldp $A[4][2],$A[4][3],[$P[0],#16*11] + ldr $A[4][4],[$P[0],#16*12] bl KeccakF1600_int - ldr $C[0],[sp,#32] - stp $A[0][0],$A[0][1],[$C[0],#16*0] - stp $A[0][2],$A[0][3],[$C[0],#16*1] - stp $A[0][4],$A[1][0],[$C[0],#16*2] - stp $A[1][1],$A[1][2],[$C[0],#16*3] - stp $A[1][3],$A[1][4],[$C[0],#16*4] - stp $A[2][0],$A[2][1],[$C[0],#16*5] - stp $A[2][2],$A[2][3],[$C[0],#16*6] - stp $A[2][4],$A[3][0],[$C[0],#16*7] - stp $A[3][1],$A[3][2],[$C[0],#16*8] - stp $A[3][3],$A[3][4],[$C[0],#16*9] - stp $A[4][0],$A[4][1],[$C[0],#16*10] - stp $A[4][2],$A[4][3],[$C[0],#16*11] - str $A[4][4],[$C[0],#16*12] - - ldp x19,x20,[x29,#16] - add sp,sp,#48 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 + ldr $P[0],[PTRN(sp),#32] + stp $A[0][0],$A[0][1],[$P[0],#16*0] + stp $A[0][2],$A[0][3],[$P[0],#16*1] + stp $A[0][4],$A[1][0],[$P[0],#16*2] + stp $A[1][1],$A[1][2],[$P[0],#16*3] + stp $A[1][3],$A[1][4],[$P[0],#16*4] + stp $A[2][0],$A[2][1],[$P[0],#16*5] + stp $A[2][2],$A[2][3],[$P[0],#16*6] + stp $A[2][4],$A[3][0],[$P[0],#16*7] + stp $A[3][1],$A[3][2],[$P[0],#16*8] + stp $A[3][3],$A[3][4],[$P[0],#16*9] + stp $A[4][0],$A[4][1],[$P[0],#16*10] + stp $A[4][2],$A[4][3],[$P[0],#16*11] + str $A[4][4],[$P[0],#16*12] + + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#(2*PTR_WIDTH+32) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(16*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600,.-KeccakF1600 @@ -365,35 +366,35 @@ .align 5 SHA3_absorb: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - stp x0,x1,[sp,#32] // offload arguments - stp x2,x3,[sp,#48] - - mov $C[0],x0 // uint64_t A[5][5] - mov $C[1],x1 // const void *inp + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#(2*PTR_WIDTH+48) + + stp PTR(0),PTR(1),[PTRN(sp),#32] // offload arguments + stp x2,x3,[PTRN(sp),#(2*PTR_WIDTH+32)] + + mov $P[0],PTR(0) // uint64_t A[5][5] + mov $P[1],PTR(1) // const void *inp mov $C[2],x2 // size_t len mov $C[3],x3 // size_t bsz - ldp $A[0][0],$A[0][1],[$C[0],#16*0] - ldp $A[0][2],$A[0][3],[$C[0],#16*1] - ldp $A[0][4],$A[1][0],[$C[0],#16*2] - ldp $A[1][1],$A[1][2],[$C[0],#16*3] - ldp $A[1][3],$A[1][4],[$C[0],#16*4] - ldp $A[2][0],$A[2][1],[$C[0],#16*5] - ldp $A[2][2],$A[2][3],[$C[0],#16*6] - ldp $A[2][4],$A[3][0],[$C[0],#16*7] - ldp $A[3][1],$A[3][2],[$C[0],#16*8] - ldp $A[3][3],$A[3][4],[$C[0],#16*9] - ldp $A[4][0],$A[4][1],[$C[0],#16*10] - ldp $A[4][2],$A[4][3],[$C[0],#16*11] - ldr $A[4][4],[$C[0],#16*12] + ldp $A[0][0],$A[0][1],[$P[0],#16*0] + ldp $A[0][2],$A[0][3],[$P[0],#16*1] + ldp $A[0][4],$A[1][0],[$P[0],#16*2] + ldp $A[1][1],$A[1][2],[$P[0],#16*3] + ldp $A[1][3],$A[1][4],[$P[0],#16*4] + ldp $A[2][0],$A[2][1],[$P[0],#16*5] + ldp $A[2][2],$A[2][3],[$P[0],#16*6] + ldp $A[2][4],$A[3][0],[$P[0],#16*7] + ldp $A[3][1],$A[3][2],[$P[0],#16*8] + ldp $A[3][3],$A[3][4],[$P[0],#16*9] + ldp $A[4][0],$A[4][1],[$P[0],#16*10] + ldp $A[4][2],$A[4][3],[$P[0],#16*11] + ldr $A[4][4],[$P[0],#16*12] b .Loop_absorb .align 4 @@ -401,19 +402,19 @@ subs $C[0],$C[2],$C[3] // len - bsz blo .Labsorbed - str $C[0],[sp,#48] // save len - bsz + str $C[0],[PTRN(sp),#(2*PTR_WIDTH+32)] // save len - bsz ___ for (my $i=0; $i<24; $i+=2) { my $j = $i+1; $code.=<<___; - ldr $C[0],[$C[1]],#8 // *inp++ + ldr $C[0],[$P[1]],#8 // *inp++ #ifdef __AARCH64EB__ rev $C[0],$C[0] #endif eor $A[$i/5][$i%5],$A[$i/5][$i%5],$C[0] cmp $C[3],#8*($i+2) blo .Lprocess_block - ldr $C[0],[$C[1]],#8 // *inp++ + ldr $C[0],[$P[1]],#8 // *inp++ #ifdef __AARCH64EB__ rev $C[0],$C[0] #endif @@ -422,70 +423,71 @@ ___ } $code.=<<___; - ldr $C[0],[$C[1]],#8 // *inp++ + ldr $C[0],[$P[1]],#8 // *inp++ #ifdef __AARCH64EB__ rev $C[0],$C[0] #endif eor $A[4][4],$A[4][4],$C[0] .Lprocess_block: - str $C[1],[sp,#40] // save inp + str $P[1],[PTRN(sp),#(PTR_WIDTH+32)] // save inp bl KeccakF1600_int - ldr $C[1],[sp,#40] // restore arguments - ldp $C[2],$C[3],[sp,#48] + ldr $P[1],[PTRN(sp),#(PTR_WIDTH+32)] // restore arguments + ldp $C[2],$C[3],[PTRN(sp),#(2*PTR_WIDTH+32)] b .Loop_absorb .align 4 .Labsorbed: - ldr $C[1],[sp,#32] - stp $A[0][0],$A[0][1],[$C[1],#16*0] - stp $A[0][2],$A[0][3],[$C[1],#16*1] - stp $A[0][4],$A[1][0],[$C[1],#16*2] - stp $A[1][1],$A[1][2],[$C[1],#16*3] - stp $A[1][3],$A[1][4],[$C[1],#16*4] - stp $A[2][0],$A[2][1],[$C[1],#16*5] - stp $A[2][2],$A[2][3],[$C[1],#16*6] - stp $A[2][4],$A[3][0],[$C[1],#16*7] - stp $A[3][1],$A[3][2],[$C[1],#16*8] - stp $A[3][3],$A[3][4],[$C[1],#16*9] - stp $A[4][0],$A[4][1],[$C[1],#16*10] - stp $A[4][2],$A[4][3],[$C[1],#16*11] - str $A[4][4],[$C[1],#16*12] + ldr $P[1],[PTRN(sp),#32] + stp $A[0][0],$A[0][1],[$P[1],#16*0] + stp $A[0][2],$A[0][3],[$P[1],#16*1] + stp $A[0][4],$A[1][0],[$P[1],#16*2] + stp $A[1][1],$A[1][2],[$P[1],#16*3] + stp $A[1][3],$A[1][4],[$P[1],#16*4] + stp $A[2][0],$A[2][1],[$P[1],#16*5] + stp $A[2][2],$A[2][3],[$P[1],#16*6] + stp $A[2][4],$A[3][0],[$P[1],#16*7] + stp $A[3][1],$A[3][2],[$P[1],#16*8] + stp $A[3][3],$A[3][4],[$P[1],#16*9] + stp $A[4][0],$A[4][1],[$P[1],#16*10] + stp 
$A[4][2],$A[4][3],[$P[1],#16*11] + str $A[4][4],[$P[1],#16*12] mov x0,$C[2] // return value - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#(2*PTR_WIDTH+48) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(16*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_absorb,.-SHA3_absorb ___ { -my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22)); +my ($A_flat,$out) = map("PTR($_)",(19..20)); +my ($len,$bsz) = map("x$_",(21..22)); $code.=<<___; .globl SHA3_squeeze .type SHA3_squeeze,%function .align 5 SHA3_squeeze: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] + stp PTR(29),PTR(30),[PTRN(sp),#-(6*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] - mov $A_flat,x0 // put aside arguments - mov $out,x1 + mov $A_flat,PTR(0) // put aside arguments + mov $out,PTR(1) mov $len,x2 mov $bsz,x3 .Loop_squeeze: - ldr x4,[x0],#8 + ldr x4,[PTR(0)],#8 cmp $len,#8 blo .Lsqueeze_tail #ifdef __AARCH64EB__ @@ -498,9 +500,9 @@ subs x3,x3,#8 bhi .Loop_squeeze - mov x0,$A_flat + mov PTR(0),$A_flat bl KeccakF1600 - mov x0,$A_flat + mov PTR(0),$A_flat mov x3,$bsz b .Loop_squeeze @@ -533,9 +535,9 @@ strb w4,[$out],#1 .Lsqueeze_done: - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x29,x30,[sp],#48 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(6*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_squeeze,.-SHA3_squeeze @@ -554,7 +556,7 @@ .align 5 KeccakF1600_ce: mov x9,#24 - adr x10,iotas + adr PTR(10),iotas b .Loop_ce .align 4 .Loop_ce: @@ -618,7 +620,7 @@ bcax $A[4][3],$A[4][3],$C[1], $A[4][4] bcax $A[4][4],$A[4][4],$A[1][3],$C[1] // A[1][3]=A[4][1] - ld1r {$C[1]},[x10],#8 + ld1r {$C[1]},[PTR(10)],#8 bcax $A[3][2],$D[1], $A[3][4],$A[0][3] // A[0][3]=A[3][3] bcax $A[3][3],$A[0][3],$A[3][0],$A[3][4] // A[0][3]=A[3][3] @@ -656,45 +658,46 @@ .align 5 KeccakF1600_cext: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp d8,d9,[sp,#16] // per ABI requirement - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH+64)]! 
+ add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] // per ABI requirement + stp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + stp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + stp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] ___ for($i=0; $i<24; $i+=2) { # load A[5][5] my $j=$i+1; $code.=<<___; - ldp d$i,d$j,[x0,#8*$i] + ldp d$i,d$j,[PTR(0),#8*$i] ___ } $code.=<<___; - ldr d24,[x0,#8*$i] + ldr d24,[PTR(0),#8*$i] bl KeccakF1600_ce - ldr x30,[sp,#8] + ldr PTR(30),[PTRN(sp),#PTR_WIDTH] ___ for($i=0; $i<24; $i+=2) { # store A[5][5] my $j=$i+1; $code.=<<___; - stp d$i,d$j,[x0,#8*$i] + stp d$i,d$j,[PTR(0),#8*$i] ___ } $code.=<<___; - str d24,[x0,#8*$i] + str d24,[PTR(0),#8*$i] - ldp d8,d9,[sp,#16] - ldp d10,d11,[sp,#32] - ldp d12,d13,[sp,#48] - ldp d14,d15,[sp,#64] - ldr x29,[sp],#80 + ldp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] + ldp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + ldp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + ldp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH+64) AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600_cext,.-KeccakF1600_cext ___ { -my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3)); +my ($ctx,$inp) = map("PTR($_)",(0..1)); +my ($len,$bsz) = map("x$_",(2..3)); $code.=<<___; .globl SHA3_absorb_cext @@ -702,21 +705,21 @@ .align 5 SHA3_absorb_cext: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp d8,d9,[sp,#16] // per ABI requirement - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH+64)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] // per ABI requirement + stp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + stp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + stp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] ___ for($i=0; $i<24; $i+=2) { # load A[5][5] my $j=$i+1; $code.=<<___; - ldp d$i,d$j,[x0,#8*$i] + ldp d$i,d$j,[PTR(0),#8*$i] ___ } $code.=<<___; - ldr d24,[x0,#8*$i] + ldr d24,[PTR(0),#8*$i] b .Loop_absorb_ce .align 4 @@ -761,38 +764,39 @@ for($i=0; $i<24; $i+=2) { # store A[5][5] my $j=$i+1; $code.=<<___; - stp d$i,d$j,[x0,#8*$i] + stp d$i,d$j,[PTR(0),#8*$i] ___ } $code.=<<___; - str d24,[x0,#8*$i] + str d24,[PTR(0),#8*$i] add x0,$len,$bsz // return value - ldp d8,d9,[sp,#16] - ldp d10,d11,[sp,#32] - ldp d12,d13,[sp,#48] - ldp d14,d15,[sp,#64] - ldp x29,x30,[sp],#80 + ldp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] + ldp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + ldp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + ldp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH+64) AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_absorb_cext,.-SHA3_absorb_cext ___ } { -my ($ctx,$out,$len,$bsz) = map("x$_",(0..3)); +my ($ctx,$out) = map("PTR($_)",(0..1)); +my ($len,$bsz) = map("x$_",(2..3)); $code.=<<___; .globl SHA3_squeeze_cext .type SHA3_squeeze_cext,%function .align 5 SHA3_squeeze_cext: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - mov x9,$ctx + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + mov PTR(9),$ctx mov x10,$bsz .Loop_squeeze_ce: - ldr x4,[x9],#8 + ldr x4,[PTR(9)],#8 cmp $len,#8 blo .Lsqueeze_tail_ce #ifdef __AARCH64EB__ @@ -806,8 +810,8 @@ bhi .Loop_squeeze_ce bl KeccakF1600_cext - ldr x30,[sp,#8] - mov x9,$ctx + ldr PTR(30),[PTRN(sp),#PTR_WIDTH] + mov PTR(9),$ctx mov x10,$bsz b .Loop_squeeze_ce @@ -840,7 +844,7 @@ strb w4,[$out],#1 .Lsqueeze_done_ce: - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_squeeze_cext,.-SHA3_squeeze_cext From 6b442fe134ab909940ded43514f5037ffd66a250 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 16 May 2024 15:36:14 -0700 Subject: [PATCH 13/19] OpenSSL sha1-armv8.pl: Add purecap support --- crypto/openssl/crypto/sha/asm/sha1-armv8.pl | 42 ++++++++++----------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/crypto/openssl/crypto/sha/asm/sha1-armv8.pl b/crypto/openssl/crypto/sha/asm/sha1-armv8.pl index 5f23a20c1ab7..728576bfab09 100755 --- a/crypto/openssl/crypto/sha/asm/sha1-armv8.pl +++ b/crypto/openssl/crypto/sha/asm/sha1-armv8.pl @@ -47,7 +47,7 @@ or die "can't call $xlate: $1"; *STDOUT=*OUT; -($ctx,$inp,$num)=("x0","x1","x2"); +($ctx,$inp,$num)=("PTR(0)","PTR(1)","x2"); @Xw=map("w$_",(3..17,19)); @Xx=map("x$_",(3..17,19)); @V=($A,$B,$C,$D,$E)=map("w$_",(20..24)); @@ -188,19 +188,19 @@ sub BODY_20_39 { .align 6 sha1_block_data_order: AARCH64_VALID_CALL_TARGET - adrp x16,OPENSSL_armcap_P - ldr w16,[x16,#:lo12:OPENSSL_armcap_P] + adrp PTR(16),OPENSSL_armcap_P + ldr w16,[PTR(16),#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] ldp $A,$B,[$ctx] ldp $C,$D,[$ctx,#8] @@ -234,12 +234,12 @@ sub BODY_20_39 { str $E,[$ctx,#16] cbnz $num,.Loop - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldp x25,x26,[sp,#64] - ldp x27,x28,[sp,#80] - ldr x29,[sp],#96 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(12*PTR_WIDTH) ret .size sha1_block_data_order,.-sha1_block_data_order ___ @@ -256,15 +256,15 @@ sub BODY_20_39 { sha1_block_armv8: .Lv8_entry: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 - adr x4,.Lconst + adr PTR(4),.Lconst eor $E,$E,$E ld1.32 {$ABCD},[$ctx],#16 ld1.32 {$E}[0],[$ctx] sub $ctx,$ctx,#16 - ld1.32 {@Kxx[0]-@Kxx[3]},[x4] + ld1.32 {@Kxx[0]-@Kxx[3]},[PTR(4)] .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 @@ -316,7 +316,7 @@ sub BODY_20_39 { st1.32 {$ABCD},[$ctx],#16 st1.32 {$E}[0],[$ctx] - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size sha1_block_armv8,.-sha1_block_armv8 .align 6 From 0c2ca4cfc03905b80d13c39f09bb78c11e27c6b1 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Thu, 16 May 2024 16:03:42 -0700 Subject: [PATCH 14/19] OpenSSL sha512-armv8.pl: Add purecap support --- crypto/openssl/crypto/sha/asm/sha512-armv8.pl | 134 ++++++++++-------- 1 file changed, 77 insertions(+), 57 deletions(-) diff --git a/crypto/openssl/crypto/sha/asm/sha512-armv8.pl b/crypto/openssl/crypto/sha/asm/sha512-armv8.pl index f900882fee8b..f79641089881 100755 --- a/crypto/openssl/crypto/sha/asm/sha512-armv8.pl +++ b/crypto/openssl/crypto/sha/asm/sha512-armv8.pl @@ -94,7 +94,8 @@ $func="sha${BITS}_block_data_order"; -($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30)); +($ctx,$inp,$num,$Ktbl)=map("PTR($_)",(0..2,30)); +($inpx,$numx)=map("x$_",(1..2)); @X=map("$reg_t$_",(3..15,0..2)); @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27)); @@ -118,13 +119,13 @@ sub BODY_00_xx { ldp @X[14],@X[15],[$inp] ___ $code.=<<___ if ($i>=14); - ldr @X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`] + ldr @X[($i-11)&15],[PTRN(sp),#`$SZ*(($i-11)%4)`] ___ $code.=<<___ if ($i>0 && $i<16); add $a,$a,$t1 // h+=Sigma0(a) ___ $code.=<<___ if ($i>=11); - str @X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`] + str @X[($i-8)&15],[PTRN(sp),#`$SZ*(($i-8)%4)`] ___ # While ARMv8 specifies merged rotate-n-logical operation such as # 'eor x,y,z,ror#n', it was found to negatively affect performance @@ -204,8 +205,8 @@ sub BODY_00_xx { $func: AARCH64_VALID_CALL_TARGET #ifndef __KERNEL__ - adrp x16,OPENSSL_armcap_P - ldr w16,[x16,#:lo12:OPENSSL_armcap_P] + adrp PTR(16),OPENSSL_armcap_P + ldr w16,[PTR(16),#:lo12:OPENSSL_armcap_P] ___ $code.=<<___ if ($SZ==4); tst w16,#ARMV8_SHA256 @@ -220,29 +221,34 @@ sub BODY_00_xx { $code.=<<___; #endif AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*$SZ + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#4*$SZ ldp $A,$B,[$ctx] // load context ldp $C,$D,[$ctx,#2*$SZ] ldp $E,$F,[$ctx,#4*$SZ] +#ifdef __CHERI_PURE_CAPABILITY__ + lsl x17,$numx,#`log(16*$SZ)/log(2)` + add $num,$inp,x17 // end of input +#else add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input +#endif ldp $G,$H,[$ctx,#6*$SZ] adr $Ktbl,.LK$BITS - stp $ctx,$num,[x29,#96] + stp $ctx,$num,[PTR(29),#(12*PTR_WIDTH)] .Loop: ldp @X[0],@X[1],[$inp],#2*$SZ ldr $t2,[$Ktbl],#$SZ // *K++ eor $t3,$B,$C // magic seed - str $inp,[x29,#112] + str $inp,[PTR(29),#(14*PTR_WIDTH)] ___ for ($i=0;$i<16;$i++) { &BODY_00_xx($i,@V); unshift(@V,pop(@V)); } $code.=".Loop_16_xx:\n"; @@ -250,8 +256,8 @@ sub BODY_00_xx { $code.=<<___; cbnz $t2,.Loop_16_xx - ldp $ctx,$num,[x29,#96] - ldr $inp,[x29,#112] + ldp $ctx,$num,[PTR(29),#(12*PTR_WIDTH)] + ldr $inp,[PTR(29),#(14*PTR_WIDTH)] sub $Ktbl,$Ktbl,#`$SZ*($rounds+1)` // rewind ldp @X[0],@X[1],[$ctx] @@ -269,18 +275,18 @@ sub BODY_00_xx { stp $C,$D,[$ctx,#2*$SZ] add $G,$G,@X[6] add $H,$H,@X[7] - cmp $inp,$num + cmp $inpx,$numx stp $E,$F,[$ctx,#4*$SZ] stp $G,$H,[$ctx,#6*$SZ] b.ne .Loop - ldp x19,x20,[x29,#16] - add sp,sp,#4*$SZ - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#4*$SZ + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(16*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size $func,.-$func @@ -358,7 +364,7 @@ sub BODY_00_xx { ___ if ($SZ==4) { -my $Ktbl="x3"; +my $Ktbl="PTR(3)"; my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2)); my @MSG=map("v$_.16b",(4..7)); @@ -372,15 +378,15 @@ sub BODY_00_xx { sha256_block_armv8: .Lv8_entry: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 ld1.32 {$ABCD,$EFGH},[$ctx] adr $Ktbl,.LK256 .Loop_hw: ld1 {@MSG[0]-@MSG[3]},[$inp],#64 - sub $num,$num,#1 + sub $numx,$numx,#1 ld1.32 {$W0},[$Ktbl],#16 rev32 @MSG[0],@MSG[0] rev32 @MSG[1],@MSG[1] @@ -429,11 +435,11 @@ sub BODY_00_xx { add.i32 $ABCD,$ABCD,$ABCD_SAVE add.i32 $EFGH,$EFGH,$EFGH_SAVE - cbnz $num,.Loop_hw + cbnz $numx,.Loop_hw st1.32 {$ABCD,$EFGH},[$ctx] - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size sha256_block_armv8,.-sha256_block_armv8 #endif @@ -448,8 +454,9 @@ sub BODY_00_xx { my @V = ($A,$B,$C,$D,$E,$F,$G,$H) = map("w$_",(3..10)); my ($t0,$t1,$t2,$t3,$t4) = map("w$_",(11..15)); -my $Ktbl="x16"; -my $Xfer="x17"; +my $Ktbl="PTR(16)"; +my $Xfer="PTR(17)"; +my $temp="x17"; my @X = map("q$_",(0..3)); my ($T0,$T1,$T2,$T3,$T4,$T5,$T6,$T7) = map("q$_",(4..7,16..19)); my $j=0; @@ -617,7 +624,7 @@ () '&eor ($t2,$a,$b)', # a^b, b^c in next round '&eor ($t4,$t4,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a) '&add ($h,$h,$t0)', # h+=Sigma1(e) - '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. 
+ '&ldr ($t1,sprintf "[PTRN(sp),#%d]",4*(($j+1)&15)) if (($j&15)!=15);'. '&ldr ($t1,"[$Ktbl]") if ($j==15);'. '&and ($t3,$t3,$t2)', # (b^c)&=(a^b) '&ror ($t4,$t4,"#$Sigma0[0]")', @@ -637,12 +644,17 @@ () AARCH64_VALID_CALL_TARGET .Lneon_entry: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later - stp x29, x30, [sp, #-16]! - mov x29, sp - sub sp,sp,#16*4 + stp PTR(29), PTR(30), [PTRN(sp), #-(2*PTR_WIDTH)]! + mov PTR(29), PTRN(sp) + sub PTRN(sp),PTRN(sp),#16*4 adr $Ktbl,.LK256 +#ifdef __CHERI_PURE_CAPABILITY__ + lsl $temp,$numx,#6 + add $num,$inp,$temp // len to point at the end of inp +#else add $num,$inp,$num,lsl#6 // len to point at the end of inp +#endif ld1.8 {@X[0]},[$inp], #16 ld1.8 {@X[1]},[$inp], #16 @@ -656,7 +668,7 @@ () rev32 @X[1],@X[1] // big-endian rev32 @X[2],@X[2] rev32 @X[3],@X[3] - mov $Xfer,sp + mov $Xfer,PTRN(sp) add.32 $T0,$T0,@X[0] add.32 $T1,$T1,@X[1] add.32 $T2,$T2,@X[2] @@ -669,7 +681,7 @@ () ldp $C,$D,[$ctx,#8] ldp $E,$F,[$ctx,#16] ldp $G,$H,[$ctx,#24] - ldr $t1,[sp,#0] + ldr $t1,[PTRN(sp),#0] mov $t2,wzr eor $t3,$B,$C mov $t4,wzr @@ -684,16 +696,24 @@ () &Xupdate(\&body_00_15); $code.=<<___; cmp $t1,#0 // check for K256 terminator - ldr $t1,[sp,#0] + ldr $t1,[PTRN(sp),#0] sub $Xfer,$Xfer,#64 bne .L_00_48 sub $Ktbl,$Ktbl,#256 // rewind $Ktbl - cmp $inp,$num - mov $Xfer, #64 - csel $Xfer, $Xfer, xzr, eq - sub $inp,$inp,$Xfer // avoid SEGV - mov $Xfer,sp + cmp $inpx,$numx +#ifdef __CHERI_PURE_CAPABILITY__ + mov $temp, #-64 +#else + mov $temp, #64 +#endif + csel $temp, $temp, xzr, eq +#ifdef __CHERI_PURE_CAPABILITY__ + add $inp,$inp,$temp // avoid SEGV +#else + sub $inp,$inp,$temp // avoid SEGV +#endif + mov $Xfer,PTRN(sp) ___ &Xpreload(\&body_00_15); &Xpreload(\&body_00_15); @@ -712,7 +732,7 @@ () ldp $t2,$t3,[$ctx,#24] add $E,$E,$t0 add $F,$F,$t1 - ldr $t1,[sp,#0] + ldr $t1,[PTRN(sp),#0] stp $A,$B,[$ctx,#0] add $G,$G,$t2 mov $t2,wzr @@ -722,18 +742,18 @@ () eor $t3,$B,$C stp $G,$H,[$ctx,#24] mov $t4,wzr - mov $Xfer,sp + mov $Xfer,PTRN(sp) b.ne .L_00_48 - ldr x29,[x29] - add sp,sp,#16*4+16 + ldr PTR(29),[PTR(29)] + add PTRN(sp),PTRN(sp),#16*4+(2*PTR_WIDTH) ret .size sha256_block_neon,.-sha256_block_neon ___ } if ($SZ==8) { -my $Ktbl="x3"; +my $Ktbl="PTR(3)"; my @H = map("v$_.16b",(0..4)); my ($fg,$de,$m9_10)=map("v$_.16b",(5..7)); @@ -748,8 +768,8 @@ () sha512_block_armv8: .Lv8_entry: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 ld1 {@MSG[0]-@MSG[3]},[$inp],#64 // load input ld1 {@MSG[4]-@MSG[7]},[$inp],#64 @@ -770,13 +790,13 @@ () .align 4 .Loop_hw: ld1.64 {$W0},[$Ktbl],#16 - subs $num,$num,#1 - sub x4,$inp,#128 + subs $numx,$numx,#1 + sub PTR(4),$inp,#128 orr $AB,@H[0],@H[0] // offload orr $CD,@H[1],@H[1] orr $EF,@H[2],@H[2] orr $GH,@H[3],@H[3] - csel $inp,$inp,x4,ne // conditional rewind + csel $inp,$inp,PTR(4),ne // conditional rewind ___ for($i=0;$i<32;$i++) { $code.=<<___; @@ -824,11 +844,11 @@ () add.i64 @H[2],@H[2],$EF add.i64 @H[3],@H[3],$GH - cbnz $num,.Loop_hw + cbnz $numx,.Loop_hw st1.64 {@H[0]-@H[3]},[$ctx] // store context - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size sha512_block_armv8,.-sha512_block_armv8 #endif From f0192a724090d2112a0645d094854166de71350d Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 1 May 2024 10:32:55 -0700 Subject: [PATCH 15/19] OpenSSL: Regen aarch64 assembly for Morello --- sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S | 876 +++++------ sys/crypto/openssl/aarch64/aesv8-armx.S | 986 +++++++------ sys/crypto/openssl/aarch64/arm64cpuid.S | 16 +- sys/crypto/openssl/aarch64/armv8-mont.S | 1314 +++++++++-------- sys/crypto/openssl/aarch64/chacha-armv8.S | 444 +++--- .../openssl/aarch64/ecp_nistz256-armv8.S | 1086 +++++++------- sys/crypto/openssl/aarch64/ghashv8-armx.S | 50 +- sys/crypto/openssl/aarch64/keccak1600-armv8.S | 538 +++---- sys/crypto/openssl/aarch64/poly1305-armv8.S | 202 +-- sys/crypto/openssl/aarch64/sha1-armv8.S | 86 +- sys/crypto/openssl/aarch64/sha256-armv8.S | 480 +++--- sys/crypto/openssl/aarch64/sha512-armv8.S | 357 ++--- sys/crypto/openssl/aarch64/vpaes-armv8.S | 348 ++--- sys/crypto/openssl/arm_arch.h | 16 + 14 files changed, 3513 insertions(+), 3286 deletions(-) diff --git a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S index 55856548fa6f..95a9b6e91916 100644 --- a/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S +++ b/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S @@ -2,7 +2,9 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=8 +#ifndef __CHERI_PURE_CAPABILITY__ .arch armv8-a+crypto +#endif .text .globl aes_gcm_enc_128_kernel .type aes_gcm_enc_128_kernel,%function @@ -10,38 +12,43 @@ aes_gcm_enc_128_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L128_enc_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] - - ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov PTR(16), PTR(4) + mov PTR(8), PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + + ldp x10, x11, [PTR(16)] //ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev x10, x10 rev x11, x11 #endif - ldp x13, x14, [x8, #160] //load rk10 + ldp x13, x14, [PTR(8), #160] //load rk10 #ifdef __AARCH64EB__ ror x13, x13, #32 ror x14, x14, #32 #endif - ld1 {v11.16b}, [x3] + ld1 {v11.16b}, [PTR(3)] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b lsr x5, x1, #3 //byte_len mov x15, x5 - ld1 {v18.4s}, [x8], #16 //load rk0 - add x4, x0, x1, lsr #3 //end_input_ptr + ld1 {v18.4s}, [PTR(8)], #16 //load rk0 +#ifdef __CHERI_PURE_CAPABILITY__ + lsr x4, x1, #3 + add PTR(4), PTR(0), x4 //end_input_ptr +#else + add PTR(4), PTR(0), x1, lsr #3 //end_input_ptr +#endif sub x5, x5, #1 //byte_len - 1 lsr x12, x11, #32 - ldr q15, [x3, #112] //load h4l | h4h + ldr q15, [PTR(3), #112] //load h4l | h4h #ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 #endif @@ -50,14 +57,14 @@ aes_gcm_enc_128_kernel: add w12, w12, #1 //increment rev_ctr32 orr w11, w11, w11 - ld1 {v19.4s}, [x8], #16 //load rk1 + ld1 {v19.4s}, [PTR(8)], #16 //load rk1 rev w9, w12 //CTR block 1 add w12, w12, #1 //CTR block 1 fmov d3, x10 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 1 - ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + ld1 { v0.16b}, [PTR(16)] //special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 @@ -70,33 +77,33 @@ aes_gcm_enc_128_kernel: rev w9, w12 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 3 - ld1 {v20.4s}, [x8], #16 //load rk2 + ld1 {v20.4s}, [PTR(8)], #16 //load rk2 add w12, w12, #1 //CTR block 3 fmov v3.d[1], x9 //CTR block 3 - ldr q14, [x3, #80] //load h3l | h3h + ldr q14, [PTR(3), #80] //load h3l | h3h #ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 - ld1 {v21.4s}, [x8], #16 //load rk3 + ld1 {v21.4s}, [PTR(8)], #16 //load rk3 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ldr q12, [x3, #32] //load h1l | h1h + ldr q12, [PTR(3), #32] //load h1l | h1h #ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 - ld1 {v22.4s}, [x8], #16 //load rk4 + ld1 {v22.4s}, [PTR(8)], #16 //load rk4 aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 - ld1 {v23.4s}, [x8], #16 //load rk5 + ld1 {v23.4s}, [PTR(8)], #16 //load rk5 aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 @@ -104,11 +111,11 @@ aes_gcm_enc_128_kernel: aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 - ld1 {v24.4s}, [x8], #16 //load rk6 + ld1 {v24.4s}, [PTR(8)], #16 //load rk6 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 - ld1 {v25.4s}, [x8], #16 //load rk7 + ld1 {v25.4s}, [PTR(8)], #16 //load rk7 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 @@ -116,11 +123,11 @@ aes_gcm_enc_128_kernel: aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 - ld1 {v26.4s}, [x8], #16 //load rk8 + ld1 {v26.4s}, [PTR(8)], #16 //load rk8 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 - ldr 
q13, [x3, #64] //load h2l | h2h + ldr q13, [PTR(3), #64] //load h2l | h2h #ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 #endif @@ -140,7 +147,7 @@ aes_gcm_enc_128_kernel: aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 - ld1 {v27.4s}, [x8], #16 //load rk9 + ld1 {v27.4s}, [PTR(8)], #16 //load rk9 aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 @@ -154,7 +161,7 @@ aes_gcm_enc_128_kernel: aese v2.16b, v22.16b aesmc v2.16b, v2.16b //AES block 2 - round 4 - cmp x0, x5 //check if we have <= 4 blocks + cmp x0, x5 //check if we have <= 4 blocks aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 @@ -222,22 +229,22 @@ aes_gcm_enc_128_kernel: aese v3.16b, v27.16b //AES block 3 - round 9 b.ge .L128_enc_tail //handle tail - ldp x6, x7, [x0, #0] //AES block 0 - load plaintext + ldp x6, x7, [PTR(0), #0] //AES block 0 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 #endif - ldp x21, x22, [x0, #32] //AES block 2 - load plaintext + ldp x21, x22, [PTR(0), #32] //AES block 2 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 #endif - ldp x19, x20, [x0, #16] //AES block 1 - load plaintext + ldp x19, x20, [PTR(0), #16] //AES block 1 - load plaintext #ifdef __AARCH64EB__ rev x19, x19 rev x20, x20 #endif - ldp x23, x24, [x0, #48] //AES block 3 - load plaintext + ldp x23, x24, [PTR(0), #48] //AES block 3 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 @@ -277,35 +284,35 @@ aes_gcm_enc_128_kernel: orr x9, x11, x9, lsl #32 //CTR block 5 add w12, w12, #1 //CTR block 5 - add x0, x0, #64 //AES input_ptr update + add PTR(0), PTR(0), #64 //AES input_ptr update fmov v1.d[1], x9 //CTR block 5 fmov d7, x23 //AES block 3 - mov low rev w9, w12 //CTR block 6 - st1 { v4.16b}, [x2], #16 //AES block 0 - store result + st1 { v4.16b}, [PTR(2)], #16 //AES block 0 - store result fmov v7.d[1], x24 //AES block 3 - mov high orr x9, x11, x9, lsl #32 //CTR block 6 add w12, w12, #1 //CTR block 6 eor v6.16b, v6.16b, v2.16b //AES block 2 - result - st1 { v5.16b}, [x2], #16 //AES block 1 - store result + st1 { v5.16b}, [PTR(2)], #16 //AES block 1 - store result fmov d2, x10 //CTR block 6 - cmp x0, x5 //check if we have <= 8 blocks + cmp x0, x5 //check if we have <= 8 blocks fmov v2.d[1], x9 //CTR block 6 rev w9, w12 //CTR block 7 - st1 { v6.16b}, [x2], #16 //AES block 2 - store result + st1 { v6.16b}, [PTR(2)], #16 //AES block 2 - store result orr x9, x11, x9, lsl #32 //CTR block 7 eor v7.16b, v7.16b, v3.16b //AES block 3 - result - st1 { v7.16b}, [x2], #16 //AES block 3 - store result + st1 { v7.16b}, [PTR(2)], #16 //AES block 3 - store result b.ge .L128_enc_prepretail //do prepretail .L128_enc_main_loop: //main loop start - ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext + ldp x23, x24, [PTR(0), #48] //AES block 4k+3 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 @@ -343,7 +350,7 @@ aes_gcm_enc_128_kernel: pmull2 v28.1q, v5.2d, v14.2d //GHASH block 4k+1 - high eor v31.8b, v31.8b, v6.8b //GHASH block 4k+2 - mid - ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext + ldp x6, x7, [PTR(0), #0] //AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -428,7 +435,7 @@ aes_gcm_enc_128_kernel: aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 - ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext + ldp x19, x20, [PTR(0), #16] //AES block 4k+5 - load plaintext #ifdef __AARCH64EB__ rev x19, x19 rev x20, x20 @@ -439,7 +446,7 @@ aes_gcm_enc_128_kernel: aese v0.16b, v23.16b 
aesmc v0.16b, v0.16b //AES block 4k+4 - round 5 - ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext + ldp x21, x22, [PTR(0), #32] //AES block 4k+6 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 @@ -468,7 +475,7 @@ aes_gcm_enc_128_kernel: aesmc v0.16b, v0.16b //AES block 4k+4 - round 6 fmov v4.d[1], x7 //AES block 4k+4 - mov high - add x0, x0, #64 //AES input_ptr update + add PTR(0), PTR(0), #64 //AES input_ptr update fmov d7, x23 //AES block 4k+3 - mov low ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment @@ -494,7 +501,7 @@ aes_gcm_enc_128_kernel: aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 - cmp x0, x5 //.LOOP CONTROL + cmp x0, x5 //.LOOP CONTROL aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 @@ -536,7 +543,7 @@ aes_gcm_enc_128_kernel: rev w9, w12 //CTR block 4k+10 aese v2.16b, v27.16b //AES block 4k+6 - round 9 - st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + st1 { v4.16b}, [PTR(2)], #16 //AES block 4k+4 - store result eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result orr x9, x11, x9, lsl #32 //CTR block 4k+10 @@ -546,17 +553,17 @@ aes_gcm_enc_128_kernel: fmov d2, x10 //CTR block 4k+10 eor v11.16b, v11.16b, v9.16b //MODULO - fold into low - st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result + st1 { v5.16b}, [PTR(2)], #16 //AES block 4k+5 - store result fmov v2.d[1], x9 //CTR block 4k+10 - st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + st1 { v6.16b}, [PTR(2)], #16 //AES block 4k+6 - store result rev w9, w12 //CTR block 4k+11 orr x9, x11, x9, lsl #32 //CTR block 4k+11 eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result eor v11.16b, v11.16b, v10.16b //MODULO - fold into low - st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result + st1 { v7.16b}, [PTR(2)], #16 //AES block 4k+3 - store result b.lt .L128_enc_main_loop .L128_enc_prepretail: //PREPRETAIL @@ -750,8 +757,8 @@ aes_gcm_enc_128_kernel: aese v2.16b, v27.16b //AES block 4k+6 - round 9 .L128_enc_tail: //TAIL - sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process - ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [PTR(0)], #16 //AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -790,9 +797,9 @@ aes_gcm_enc_128_kernel: sub w12, w12, #1 b .L128_enc_blocks_less_than_1 .L128_enc_blocks_more_than_3: //blocks left > 3 - st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + st1 { v5.16b}, [PTR(2)], #16 //AES final-3 block - store result - ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high + ldp x6, x7, [PTR(0)], #16 //AES final-2 block - load input low & high #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -821,10 +828,10 @@ aes_gcm_enc_128_kernel: pmull v10.1q, v22.1d, v10.1d //GHASH final-3 block - mid .L128_enc_blocks_more_than_2: //blocks left > 2 - st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + st1 { v5.16b}, [PTR(2)], #16 //AES final-2 block - store result rev64 v4.16b, v5.16b //GHASH final-2 block - ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high + ldp x6, x7, [PTR(0)], #16 //AES final-1 block - load input low & high #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -858,10 +865,10 @@ aes_gcm_enc_128_kernel: eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid .L128_enc_blocks_more_than_1: //blocks left > 1 - st1 { v5.16b}, [x2], #16 //AES final-1 block - store result + st1 { v5.16b}, [PTR(2)], #16 //AES final-1 block - store 
result rev64 v4.16b, v5.16b //GHASH final-1 block - ldp x6, x7, [x0], #16 //AES final block - load input low & high + ldp x6, x7, [PTR(0)], #16 //AES final block - load input low & high #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -925,7 +932,7 @@ aes_gcm_enc_128_kernel: mov d8, v4.d[1] //GHASH final block - mid pmull v21.1q, v4.1d, v12.1d //GHASH final block - low - ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + ld1 { v18.16b}, [PTR(2)] //load existing bytes where the possibly partial last block is to be stored eor v8.8b, v8.8b, v4.8b //GHASH final block - mid #ifndef __AARCH64EB__ @@ -965,22 +972,22 @@ aes_gcm_enc_128_kernel: bif v5.16b, v18.16b, v0.16b //insert existing bytes in top end of result before storing eor v11.16b, v11.16b, v9.16b //MODULO - fold into low - st1 { v5.16b}, [x2] //store all 16B + st1 { v5.16b}, [PTR(2)] //store all 16B - str w9, [x16, #12] //store the updated counter + str w9, [PTR(16), #12] //store the updated counter eor v11.16b, v11.16b, v10.16b //MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 - st1 { v11.16b }, [x3] - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + st1 { v11.16b }, [PTR(3)] + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L128_enc_ret: @@ -993,42 +1000,42 @@ aes_gcm_enc_128_kernel: aes_gcm_dec_128_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L128_dec_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov PTR(16), PTR(4) + mov PTR(8), PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] lsr x5, x1, #3 //byte_len mov x15, x5 - ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 + ldp x10, x11, [PTR(16)] //ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev x10, x10 rev x11, x11 #endif - ldp x13, x14, [x8, #160] //load rk10 + ldp x13, x14, [PTR(8), #160] //load rk10 #ifdef __AARCH64EB__ ror x14, x14, 32 ror x13, x13, 32 #endif sub x5, x5, #1 //byte_len - 1 - ld1 {v18.4s}, [x8], #16 //load rk0 + ld1 {v18.4s}, [PTR(8)], #16 //load rk0 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) - ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + ld1 { v0.16b}, [PTR(16)] //special case vector load initial counter so we can start first AES block as quickly as possible - ldr q13, [x3, #64] //load h2l | h2h + ldr q13, [PTR(3), #64] //load h2l | h2h #ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 #endif lsr x12, x11, #32 fmov d2, x10 //CTR block 2 - ld1 {v19.4s}, [x8], #16 //load rk1 + ld1 {v19.4s}, [PTR(8)], #16 //load rk1 orr w11, w11, w11 rev w12, w12 //rev_ctr32 @@ -1040,7 +1047,7 @@ aes_gcm_dec_128_kernel: rev w9, w12 //CTR block 1 orr x9, x11, x9, lsl #32 //CTR block 1 - ld1 {v20.4s}, [x8], #16 //load rk2 + ld1 {v20.4s}, [PTR(8)], #16 //load rk2 add w12, w12, #1 //CTR block 1 fmov v1.d[1], x9 //CTR block 1 @@ -1059,23 +1066,28 @@ aes_gcm_dec_128_kernel: add w12, w12, #1 //CTR block 3 fmov v3.d[1], x9 //CTR block 3 - add x4, x0, x1, lsr #3 //end_input_ptr +#ifdef __CHERI_PURE_CAPABILITY__ + lsr x4, x1, #3 + add PTR(4), PTR(0), x4 //end_input_ptr +#else + add PTR(4), PTR(0), x1, lsr #3 //end_input_ptr +#endif aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 - ld1 {v21.4s}, [x8], #16 //load rk3 + ld1 {v21.4s}, [PTR(8)], #16 //load rk3 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 - ld1 {v22.4s}, [x8], #16 //load rk4 + ld1 {v22.4s}, [PTR(8)], #16 //load rk4 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ld1 {v23.4s}, [x8], #16 //load rk5 + ld1 {v23.4s}, [PTR(8)], #16 //load rk5 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 - ld1 {v24.4s}, [x8], #16 //load rk6 + ld1 {v24.4s}, [PTR(8)], #16 //load rk6 aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 @@ -1088,13 +1100,13 @@ aes_gcm_dec_128_kernel: aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 - ld1 { v11.16b}, [x3] + ld1 { v11.16b}, [PTR(3)] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 - ld1 {v25.4s}, [x8], #16 //load rk7 + ld1 {v25.4s}, [PTR(8)], #16 //load rk7 aese v1.16b, v21.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 @@ -1104,7 +1116,7 @@ aes_gcm_dec_128_kernel: aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 - ld1 {v26.4s}, [x8], #16 //load rk8 + ld1 {v26.4s}, [PTR(8)], #16 //load rk8 aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 @@ -1114,13 +1126,13 @@ aes_gcm_dec_128_kernel: aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 - ldr q14, [x3, #80] //load h3l | h3h + ldr q14, [PTR(3), #80] //load h3l | h3h #ifndef __AARCH64EB__ ext v14.16b, 
v14.16b, v14.16b, #8 #endif aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 - ld1 {v27.4s}, [x8], #16 //load rk9 + ld1 {v27.4s}, [PTR(8)], #16 //load rk9 aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 @@ -1136,7 +1148,7 @@ aes_gcm_dec_128_kernel: aese v2.16b, v23.16b aesmc v2.16b, v2.16b //AES block 2 - round 5 - ldr q12, [x3, #32] //load h1l | h1h + ldr q12, [PTR(3), #32] //load h1l | h1h #ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 #endif @@ -1156,7 +1168,7 @@ aes_gcm_dec_128_kernel: aesmc v2.16b, v2.16b //AES block 2 - round 6 trn1 v8.2d, v12.2d, v13.2d //h2h | h1h - ldr q15, [x3, #112] //load h4l | h4h + ldr q15, [PTR(3), #112] //load h4l | h4h #ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 #endif @@ -1195,16 +1207,16 @@ aes_gcm_dec_128_kernel: aese v3.16b, v27.16b //AES block 3 - round 9 aese v0.16b, v27.16b //AES block 0 - round 9 - cmp x0, x5 //check if we have <= 4 blocks + cmp x0, x5 //check if we have <= 4 blocks aese v1.16b, v27.16b //AES block 1 - round 9 eor v17.16b, v17.16b, v9.16b //h4k | h3k b.ge .L128_dec_tail //handle tail - ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext + ld1 {v4.16b, v5.16b}, [PTR(0)], #32 //AES block 0 - load ciphertext; AES block 1 - load ciphertext eor v1.16b, v5.16b, v1.16b //AES block 1 - result - ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext + ld1 {v6.16b}, [PTR(0)], #16 //AES block 2 - load ciphertext eor v0.16b, v4.16b, v0.16b //AES block 0 - result rev64 v4.16b, v4.16b //GHASH block 0 @@ -1212,7 +1224,7 @@ aes_gcm_dec_128_kernel: orr x9, x11, x9, lsl #32 //CTR block 4 add w12, w12, #1 //CTR block 4 - ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext + ld1 {v7.16b}, [PTR(0)], #16 //AES block 3 - load ciphertext rev64 v5.16b, v5.16b //GHASH block 1 mov x19, v1.d[0] //AES block 1 - mov low @@ -1220,7 +1232,7 @@ aes_gcm_dec_128_kernel: mov x20, v1.d[1] //AES block 1 - mov high mov x6, v0.d[0] //AES block 0 - mov low - cmp x0, x5 //check if we have <= 8 blocks + cmp x0, x5 //check if we have <= 8 blocks mov x7, v0.d[1] //AES block 0 - mov high @@ -1256,9 +1268,9 @@ aes_gcm_dec_128_kernel: #ifdef __AARCH64EB__ rev x7, x7 #endif - stp x6, x7, [x2], #16 //AES block 0 - store result + stp x6, x7, [PTR(2)], #16 //AES block 0 - store result - stp x19, x20, [x2], #16 //AES block 1 - store result + stp x19, x20, [PTR(2)], #16 //AES block 1 - store result b.ge .L128_dec_prepretail //do prepretail .L128_dec_main_loop: //main loop start @@ -1398,11 +1410,11 @@ aes_gcm_dec_128_kernel: aese v2.16b, v22.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 - stp x21, x22, [x2], #16 //AES block 4k+2 - store result + stp x21, x22, [PTR(2)], #16 //AES block 4k+2 - store result pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid eor v9.16b, v9.16b, v4.16b //GHASH block 4k+3 - high - ld1 {v4.16b}, [x0], #16 //AES block 4k+3 - load ciphertext + ld1 {v4.16b}, [PTR(0)], #16 //AES block 4k+3 - load ciphertext aese v1.16b, v25.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 7 @@ -1418,7 +1430,7 @@ aes_gcm_dec_128_kernel: aese v1.16b, v26.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 8 - stp x23, x24, [x2], #16 //AES block 4k+3 - store result + stp x23, x24, [PTR(2)], #16 //AES block 4k+3 - store result aese v0.16b, v26.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 8 @@ -1429,7 +1441,7 @@ aes_gcm_dec_128_kernel: rev w9, w12 //CTR block 4k+8 pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid - ld1 {v5.16b}, [x0], #16 //AES block 4k+4 - 
load ciphertext + ld1 {v5.16b}, [PTR(0)], #16 //AES block 4k+4 - load ciphertext ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment aese v0.16b, v27.16b //AES block 4k+4 - round 9 @@ -1447,7 +1459,7 @@ aes_gcm_dec_128_kernel: aese v3.16b, v23.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 5 - ld1 {v6.16b}, [x0], #16 //AES block 4k+5 - load ciphertext + ld1 {v6.16b}, [PTR(0)], #16 //AES block 4k+5 - load ciphertext add w12, w12, #1 //CTR block 4k+8 eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid @@ -1455,7 +1467,7 @@ aes_gcm_dec_128_kernel: aese v2.16b, v25.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 7 - ld1 {v7.16b}, [x0], #16 //AES block 4k+6 - load ciphertext + ld1 {v7.16b}, [PTR(0)], #16 //AES block 4k+6 - load ciphertext aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 @@ -1498,7 +1510,7 @@ aes_gcm_dec_128_kernel: aese v3.16b, v27.16b //AES block 4k+7 - round 9 fmov d1, x10 //CTR block 4k+9 - cmp x0, x5 //.LOOP CONTROL + cmp x0, x5 //.LOOP CONTROL rev64 v4.16b, v4.16b //GHASH block 4k+4 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low @@ -1511,13 +1523,13 @@ aes_gcm_dec_128_kernel: #ifdef __AARCH64EB__ rev x20, x20 #endif - stp x6, x7, [x2], #16 //AES block 4k+4 - store result + stp x6, x7, [PTR(2)], #16 //AES block 4k+4 - store result eor x19, x19, x13 //AES block 4k+5 - round 10 low #ifdef __AARCH64EB__ rev x19, x19 #endif - stp x19, x20, [x2], #16 //AES block 4k+5 - store result + stp x19, x20, [PTR(2)], #16 //AES block 4k+5 - store result orr x9, x11, x9, lsl #32 //CTR block 4k+10 b.lt .L128_dec_main_loop @@ -1725,18 +1737,18 @@ aes_gcm_dec_128_kernel: rev x22, x22 #endif aese v0.16b, v27.16b //AES block 4k+4 - round 9 - stp x21, x22, [x2], #16 //AES block 4k+2 - store result + stp x21, x22, [PTR(2)], #16 //AES block 4k+2 - store result aese v2.16b, v27.16b //AES block 4k+6 - round 9 add w12, w12, #1 //CTR block 4k+7 - stp x23, x24, [x2], #16 //AES block 4k+3 - store result + stp x23, x24, [PTR(2)], #16 //AES block 4k+3 - store result aese v3.16b, v27.16b //AES block 4k+7 - round 9 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low .L128_dec_tail: //TAIL - sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process - ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [PTR(0)], #16 //AES block 4k+4 - load ciphertext eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result @@ -1778,12 +1790,12 @@ aes_gcm_dec_128_kernel: b .L128_dec_blocks_less_than_1 .L128_dec_blocks_more_than_3: //blocks left > 3 rev64 v4.16b, v5.16b //GHASH final-3 block - ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext + ld1 { v5.16b}, [PTR(0)], #16 //AES final-2 block - load ciphertext eor v4.16b, v4.16b, v8.16b //feed in partial tag mov d10, v17.d[1] //GHASH final-3 block - mid - stp x6, x7, [x2], #16 //AES final-3 block - store result + stp x6, x7, [PTR(2)], #16 //AES final-3 block - store result eor v0.16b, v5.16b, v1.16b //AES final-2 block - result mov d22, v4.d[1] //GHASH final-3 block - mid @@ -1809,12 +1821,12 @@ aes_gcm_dec_128_kernel: .L128_dec_blocks_more_than_2: //blocks left > 2 rev64 v4.16b, v5.16b //GHASH final-2 block - ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext + ld1 { v5.16b}, [PTR(0)], #16 //AES final-1 block - load ciphertext eor v4.16b, v4.16b, v8.16b //feed in partial tag eor v0.16b, v5.16b, v2.16b //AES final-1 block - result - stp x6, x7, [x2], #16 //AES final-2 block - store result + stp x6, 
x7, [PTR(2)], #16 //AES final-2 block - store result mov d22, v4.d[1] //GHASH final-2 block - mid @@ -1847,7 +1859,7 @@ aes_gcm_dec_128_kernel: rev64 v4.16b, v5.16b //GHASH final-1 block - ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext + ld1 { v5.16b}, [PTR(0)], #16 //AES final block - load ciphertext eor v4.16b, v4.16b, v8.16b //feed in partial tag mov d22, v4.d[1] //GHASH final-1 block - mid @@ -1856,7 +1868,7 @@ aes_gcm_dec_128_kernel: eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid - stp x6, x7, [x2], #16 //AES final-1 block - store result + stp x6, x7, [PTR(2)], #16 //AES final-1 block - store result mov x6, v0.d[0] //AES final block - mov low mov x7, v0.d[1] //AES final block - mov high @@ -1909,7 +1921,7 @@ aes_gcm_dec_128_kernel: eor v4.16b, v4.16b, v8.16b //feed in partial tag - ldp x4, x5, [x2] //load existing bytes we need to not overwrite + ldp x4, x5, [PTR(2)] //load existing bytes we need to not overwrite and x7, x7, x10 @@ -1922,7 +1934,7 @@ aes_gcm_dec_128_kernel: pmull v8.1q, v8.1d, v16.1d //GHASH final block - mid pmull v21.1q, v4.1d, v12.1d //GHASH final block - low - bic x4, x4, x9 //mask out low existing bytes + bic x4, x4, x9 //mask out low existing bytes and x6, x6, x9 #ifndef __AARCH64EB__ @@ -1946,10 +1958,10 @@ aes_gcm_dec_128_kernel: eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up orr x6, x6, x4 - str w9, [x16, #12] //store the updated counter + str w9, [PTR(16), #12] //store the updated counter orr x7, x7, x5 - stp x6, x7, [x2] + stp x6, x7, [PTR(2)] ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment eor v10.16b, v10.16b, v31.16b //MODULO - fold into mid @@ -1965,15 +1977,15 @@ aes_gcm_dec_128_kernel: ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 - st1 { v11.16b }, [x3] - - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + st1 { v11.16b }, [PTR(3)] + + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L128_dec_ret: @@ -1986,37 +1998,37 @@ aes_gcm_dec_128_kernel: aes_gcm_enc_192_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L192_enc_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] - - ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov PTR(16), PTR(4) + mov PTR(8), PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + + ldp x10, x11, [PTR(16)] //ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev x10, x10 rev x11, x11 #endif - ldp x13, x14, [x8, #192] //load rk12 + ldp x13, x14, [PTR(8), #192] //load rk12 #ifdef __AARCH64EB__ ror x13, x13, #32 ror x14, x14, #32 #endif - ld1 {v18.4s}, [x8], #16 //load rk0 + ld1 {v18.4s}, [PTR(8)], #16 //load rk0 - ld1 {v19.4s}, [x8], #16 //load rk1 + ld1 {v19.4s}, [PTR(8)], #16 //load rk1 - ld1 {v20.4s}, [x8], #16 //load rk2 + ld1 {v20.4s}, [PTR(8)], #16 //load rk2 lsr x12, x11, #32 - ld1 {v21.4s}, [x8], #16 //load rk3 + ld1 {v21.4s}, [PTR(8)], #16 //load rk3 orr w11, w11, w11 - ld1 {v22.4s}, [x8], #16 //load rk4 + ld1 {v22.4s}, [PTR(8)], #16 //load rk4 rev w12, w12 //rev_ctr32 add w12, w12, #1 //increment rev_ctr32 @@ -2027,7 +2039,7 @@ aes_gcm_enc_192_kernel: fmov d1, x10 //CTR block 1 orr x9, x11, x9, lsl #32 //CTR block 1 - ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + ld1 { v0.16b}, [PTR(16)] //special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 @@ -2040,51 +2052,51 @@ aes_gcm_enc_192_kernel: rev w9, w12 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 3 - ld1 {v23.4s}, [x8], #16 //load rk5 + ld1 {v23.4s}, [PTR(8)], #16 //load rk5 fmov v3.d[1], x9 //CTR block 3 - ld1 {v24.4s}, [x8], #16 //load rk6 + ld1 {v24.4s}, [PTR(8)], #16 //load rk6 - ld1 {v25.4s}, [x8], #16 //load rk7 + ld1 {v25.4s}, [PTR(8)], #16 //load rk7 aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 - ld1 { v11.16b}, [x3] + ld1 { v11.16b}, [PTR(3)] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 - ld1 {v26.4s}, [x8], #16 //load rk8 + ld1 {v26.4s}, [PTR(8)], #16 //load rk8 aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 - ldr q15, [x3, #112] //load h4l | h4h + ldr q15, [PTR(3), #112] //load h4l | h4h #ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 #endif aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ld1 {v27.4s}, [x8], #16 //load rk9 + ld1 {v27.4s}, [PTR(8)], #16 //load rk9 aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 - ld1 {v28.4s}, [x8], #16 //load rk10 + ld1 {v28.4s}, [PTR(8)], #16 //load rk10 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 - ldr q12, [x3, #32] //load h1l | h1h + ldr q12, [PTR(3), #32] //load h1l | h1h #ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 - ld1 {v29.4s}, [x8], #16 //load rk11 + ld1 {v29.4s}, [PTR(8)], #16 //load rk11 aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 - ldr q14, [x3, #80] //load h3l | h3h + ldr q14, [PTR(3), #80] //load h3l | h3h #ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 #endif @@ -2143,7 +2155,7 @@ aes_gcm_enc_192_kernel: aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 - ldr q13, [x3, #64] //load h2l | h2h + ldr q13, [PTR(3), #64] //load h2l | h2h #ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 #endif @@ -2212,11 +2224,16 @@ aes_gcm_enc_192_kernel: eor 
v17.16b, v17.16b, v9.16b //h4k | h3k aese v2.16b, v29.16b //AES block 2 - round 11 - add x4, x0, x1, lsr #3 //end_input_ptr +#ifdef __CHERI_PURE_CAPABILITY__ + lsr x4, x1, #3 + add PTR(4), PTR(0), x4 //end_input_ptr +#else + add PTR(4), PTR(0), x1, lsr #3 //end_input_ptr +#endif add x5, x5, x0 aese v1.16b, v29.16b //AES block 1 - round 11 - cmp x0, x5 //check if we have <= 4 blocks + cmp x0, x5 //check if we have <= 4 blocks aese v0.16b, v29.16b //AES block 0 - round 11 add w12, w12, #1 //CTR block 3 @@ -2225,29 +2242,29 @@ aes_gcm_enc_192_kernel: b.ge .L192_enc_tail //handle tail rev w9, w12 //CTR block 4 - ldp x6, x7, [x0, #0] //AES block 0 - load plaintext + ldp x6, x7, [PTR(0), #0] //AES block 0 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 #endif orr x9, x11, x9, lsl #32 //CTR block 4 - ldp x21, x22, [x0, #32] //AES block 2 - load plaintext + ldp x21, x22, [PTR(0), #32] //AES block 2 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 #endif - ldp x23, x24, [x0, #48] //AES block 3 - load plaintext + ldp x23, x24, [PTR(0), #48] //AES block 3 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 #endif - ldp x19, x20, [x0, #16] //AES block 1 - load plaintext + ldp x19, x20, [PTR(0), #16] //AES block 1 - load plaintext #ifdef __AARCH64EB__ rev x19, x19 rev x20, x20 #endif - add x0, x0, #64 //AES input_ptr update - cmp x0, x5 //check if we have <= 8 blocks + add PTR(0), PTR(0), #64 //AES input_ptr update + cmp x0, x5 //check if we have <= 8 blocks eor x6, x6, x13 //AES block 0 - round 12 low @@ -2280,13 +2297,13 @@ aes_gcm_enc_192_kernel: add w12, w12, #1 //CTR block 5 fmov d7, x23 //AES block 3 - mov low - st1 { v4.16b}, [x2], #16 //AES block 0 - store result + st1 { v4.16b}, [PTR(2)], #16 //AES block 0 - store result fmov v6.d[1], x22 //AES block 2 - mov high eor v5.16b, v5.16b, v1.16b //AES block 1 - result fmov d1, x10 //CTR block 5 - st1 { v5.16b}, [x2], #16 //AES block 1 - store result + st1 { v5.16b}, [PTR(2)], #16 //AES block 1 - store result fmov v7.d[1], x24 //AES block 3 - mov high @@ -2303,10 +2320,10 @@ aes_gcm_enc_192_kernel: rev w9, w12 //CTR block 7 orr x9, x11, x9, lsl #32 //CTR block 7 - st1 { v6.16b}, [x2], #16 //AES block 2 - store result + st1 { v6.16b}, [PTR(2)], #16 //AES block 2 - store result eor v7.16b, v7.16b, v3.16b //AES block 3 - result - st1 { v7.16b}, [x2], #16 //AES block 3 - store result + st1 { v7.16b}, [PTR(2)], #16 //AES block 3 - store result b.ge .L192_enc_prepretail //do prepretail .L192_enc_main_loop: //main loop start @@ -2316,7 +2333,7 @@ aes_gcm_enc_192_kernel: aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 0 - ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext + ldp x19, x20, [PTR(0), #16] //AES block 4k+5 - load plaintext #ifdef __AARCH64EB__ rev x19, x19 rev x20, x20 @@ -2331,14 +2348,14 @@ aes_gcm_enc_192_kernel: pmull2 v30.1q, v5.2d, v14.2d //GHASH block 4k+1 - high rev64 v7.16b, v7.16b //GHASH block 4k+3 (t0, t1, t2 and t3 free) - ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext + ldp x21, x22, [PTR(0), #32] //AES block 4k+6 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 #endif aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 0 - ldp x23, x24, [x0, #48] //AES block 4k+3 - load plaintext + ldp x23, x24, [PTR(0), #48] //AES block 4k+3 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 @@ -2434,7 +2451,7 @@ aes_gcm_enc_192_kernel: aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 4 - ldp x6, x7, [x0, #0] 
//AES block 4k+4 - load plaintext + ldp x6, x7, [PTR(0), #0] //AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -2445,7 +2462,7 @@ aes_gcm_enc_192_kernel: aese v2.16b, v22.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 4 - add x0, x0, #64 //AES input_ptr update + add PTR(0), PTR(0), #64 //AES input_ptr update aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 5 @@ -2484,7 +2501,7 @@ aes_gcm_enc_192_kernel: eor v11.16b, v11.16b, v6.16b //GHASH block 4k+3 - low pmull v30.1q, v30.1d, v16.1d //GHASH block 4k+3 - mid - cmp x0, x5 //.LOOP CONTROL + cmp x0, x5 //.LOOP CONTROL fmov d4, x6 //AES block 4k+4 - mov low aese v2.16b, v24.16b @@ -2551,7 +2568,7 @@ aes_gcm_enc_192_kernel: pmull v9.1q, v10.1d, v8.1d //MODULO - mid 64b align with low fmov v6.d[1], x22 //AES block 4k+6 - mov high - st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + st1 { v4.16b}, [PTR(2)], #16 //AES block 4k+4 - store result aese v3.16b, v28.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 10 @@ -2569,14 +2586,14 @@ aes_gcm_enc_192_kernel: ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment orr x9, x11, x9, lsl #32 //CTR block 4k+10 - st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result + st1 { v5.16b}, [PTR(2)], #16 //AES block 4k+5 - store result eor v11.16b, v11.16b, v9.16b //MODULO - fold into low aese v3.16b, v29.16b //AES block 4k+7 - round 11 eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result fmov d2, x10 //CTR block 4k+10 - st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + st1 { v6.16b}, [PTR(2)], #16 //AES block 4k+6 - store result fmov v2.d[1], x9 //CTR block 4k+10 rev w9, w12 //CTR block 4k+11 @@ -2584,7 +2601,7 @@ aes_gcm_enc_192_kernel: orr x9, x11, x9, lsl #32 //CTR block 4k+11 eor v7.16b, v7.16b, v3.16b //AES block 4k+3 - result - st1 { v7.16b}, [x2], #16 //AES block 4k+3 - store result + st1 { v7.16b}, [PTR(2)], #16 //AES block 4k+3 - store result b.lt .L192_enc_main_loop .L192_enc_prepretail: //PREPRETAIL @@ -2801,8 +2818,8 @@ aes_gcm_enc_192_kernel: eor v11.16b, v11.16b, v10.16b .L192_enc_tail: //TAIL - sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process - ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [PTR(0)], #16 //AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -2840,9 +2857,9 @@ aes_gcm_enc_192_kernel: sub w12, w12, #1 b .L192_enc_blocks_less_than_1 .L192_enc_blocks_more_than_3: //blocks left > 3 - st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + st1 { v5.16b}, [PTR(2)], #16 //AES final-3 block - store result - ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high + ldp x6, x7, [PTR(0)], #16 //AES final-2 block - load input low & high #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -2873,10 +2890,10 @@ aes_gcm_enc_192_kernel: eor v5.16b, v5.16b, v1.16b //AES final-2 block - result .L192_enc_blocks_more_than_2: //blocks left > 2 - st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + st1 { v5.16b}, [PTR(2)], #16 //AES final-2 block - store result rev64 v4.16b, v5.16b //GHASH final-2 block - ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high + ldp x6, x7, [PTR(0)], #16 //AES final-1 block - load input low & high #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -2908,9 +2925,9 @@ aes_gcm_enc_192_kernel: eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid .L192_enc_blocks_more_than_1: //blocks left > 1 - st1 { v5.16b}, [x2], #16 
//AES final-1 block - store result + st1 { v5.16b}, [PTR(2)], #16 //AES final-1 block - store result - ldp x6, x7, [x0], #16 //AES final block - load input low & high + ldp x6, x7, [PTR(0)], #16 //AES final block - load input low & high #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -2945,7 +2962,7 @@ aes_gcm_enc_192_kernel: eor v10.16b, v10.16b, v22.16b //GHASH final-1 block - mid .L192_enc_blocks_less_than_1: //blocks left <= 1 - ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + ld1 { v18.16b}, [PTR(2)] //load existing bytes where the possibly partial last block is to be stored #ifndef __AARCH64EB__ rev w9, w12 #else @@ -3015,23 +3032,23 @@ aes_gcm_enc_192_kernel: ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment eor v11.16b, v11.16b, v9.16b //MODULO - fold into low - str w9, [x16, #12] //store the updated counter + str w9, [PTR(16), #12] //store the updated counter - st1 { v5.16b}, [x2] //store all 16B + st1 { v5.16b}, [PTR(2)] //store all 16B eor v11.16b, v11.16b, v10.16b //MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 - st1 { v11.16b }, [x3] - - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + st1 { v11.16b }, [PTR(3)] + + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L192_enc_ret: @@ -3044,34 +3061,39 @@ aes_gcm_enc_192_kernel: aes_gcm_dec_192_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L192_dec_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] - - add x4, x0, x1, lsr #3 //end_input_ptr - ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov PTR(16), PTR(4) + mov PTR(8), PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + +#ifdef __CHERI_PURE_CAPABILITY__ + lsr x4, x1, #3 + add PTR(4), PTR(0), x4 //end_input_ptr +#else + add PTR(4), PTR(0), x1, lsr #3 //end_input_ptr +#endif + ldp x10, x11, [PTR(16)] //ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev x10, x10 rev x11, x11 #endif - ldp x13, x14, [x8, #192] //load rk12 + ldp x13, x14, [PTR(8), #192] //load rk12 #ifdef __AARCH64EB__ ror x13, x13, #32 ror x14, x14, #32 #endif - ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + ld1 { v0.16b}, [PTR(16)] //special case vector load initial counter so we can start first AES block as quickly as possible - ld1 {v18.4s}, [x8], #16 //load rk0 + ld1 {v18.4s}, [PTR(8)], #16 //load rk0 lsr x5, x1, #3 //byte_len mov x15, x5 - ld1 {v19.4s}, [x8], #16 //load rk1 + ld1 {v19.4s}, [PTR(8)], #16 //load rk1 lsr x12, x11, #32 orr w11, w11, w11 @@ -3081,7 +3103,7 @@ aes_gcm_dec_192_kernel: fmov d1, x10 //CTR block 1 add w12, w12, #1 //increment rev_ctr32 - ld1 {v20.4s}, [x8], #16 //load rk2 + ld1 {v20.4s}, [PTR(8)], #16 //load rk2 aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 @@ -3089,7 +3111,7 @@ aes_gcm_dec_192_kernel: add w12, w12, #1 //CTR block 1 orr x9, x11, x9, lsl #32 //CTR block 1 - ld1 {v21.4s}, [x8], #16 //load rk3 + ld1 {v21.4s}, [PTR(8)], #16 //load rk3 fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 @@ -3107,30 +3129,30 @@ aes_gcm_dec_192_kernel: fmov v3.d[1], x9 //CTR block 3 - ld1 {v22.4s}, [x8], #16 //load rk4 + ld1 {v22.4s}, [PTR(8)], #16 //load rk4 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ld1 {v23.4s}, [x8], #16 //load rk5 + ld1 {v23.4s}, [PTR(8)], #16 //load rk5 aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 - ldr q15, [x3, #112] //load h4l | h4h + ldr q15, [PTR(3), #112] //load h4l | h4h #ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 #endif aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 - ldr q13, [x3, #64] //load h2l | h2h + ldr q13, [PTR(3), #64] //load h2l | h2h #ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 - ldr q14, [x3, #80] //load h3l | h3h + ldr q14, [PTR(3), #80] //load h3l | h3h #ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 #endif @@ -3139,29 +3161,29 @@ aes_gcm_dec_192_kernel: aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 - ldr q12, [x3, #32] //load h1l | h1h + ldr q12, [PTR(3), #32] //load h1l | h1h #ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 - ld1 {v24.4s}, [x8], #16 //load rk6 + ld1 {v24.4s}, [PTR(8)], #16 //load rk6 aese v0.16b, v21.16b aesmc v0.16b, v0.16b //AES block 0 - round 3 - ld1 {v25.4s}, [x8], #16 //load rk7 + ld1 {v25.4s}, [PTR(8)], #16 //load rk7 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 - ld1 {v26.4s}, [x8], #16 //load rk8 + ld1 {v26.4s}, [PTR(8)], #16 //load rk8 aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 - ld1 {v27.4s}, [x8], #16 //load rk9 + ld1 {v27.4s}, [PTR(8)], #16 //load rk9 aese 
v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 - ld1 { v11.16b}, [x3] + ld1 { v11.16b}, [PTR(3)] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b @@ -3175,7 +3197,7 @@ aes_gcm_dec_192_kernel: aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 - ld1 {v28.4s}, [x8], #16 //load rk10 + ld1 {v28.4s}, [PTR(8)], #16 //load rk10 aese v1.16b, v22.16b aesmc v1.16b, v1.16b //AES block 1 - round 4 @@ -3190,7 +3212,7 @@ aes_gcm_dec_192_kernel: aese v0.16b, v23.16b aesmc v0.16b, v0.16b //AES block 0 - round 5 - ld1 {v29.4s}, [x8], #16 //load rk11 + ld1 {v29.4s}, [PTR(8)], #16 //load rk11 aese v1.16b, v23.16b aesmc v1.16b, v1.16b //AES block 1 - round 5 @@ -3251,7 +3273,7 @@ aes_gcm_dec_192_kernel: aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 9 - cmp x0, x5 //check if we have <= 4 blocks + cmp x0, x5 //check if we have <= 4 blocks aese v0.16b, v27.16b aesmc v0.16b, v0.16b //AES block 0 - round 9 @@ -3277,13 +3299,13 @@ aes_gcm_dec_192_kernel: aese v0.16b, v29.16b //AES block 0 - round 11 b.ge .L192_dec_tail //handle tail - ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext + ld1 {v4.16b, v5.16b}, [PTR(0)], #32 //AES block 0,1 - load ciphertext eor v1.16b, v5.16b, v1.16b //AES block 1 - result eor v0.16b, v4.16b, v0.16b //AES block 0 - result rev w9, w12 //CTR block 4 - ld1 {v6.16b, v7.16b}, [x0], #32 //AES block 2,3 - load ciphertext + ld1 {v6.16b, v7.16b}, [PTR(0)], #32 //AES block 2,3 - load ciphertext mov x19, v1.d[0] //AES block 1 - mov low @@ -3298,7 +3320,7 @@ aes_gcm_dec_192_kernel: fmov d0, x10 //CTR block 4 rev64 v5.16b, v5.16b //GHASH block 1 - cmp x0, x5 //check if we have <= 8 blocks + cmp x0, x5 //check if we have <= 8 blocks eor x19, x19, x13 //AES block 1 - round 12 low #ifdef __AARCH64EB__ @@ -3324,10 +3346,10 @@ aes_gcm_dec_192_kernel: #ifdef __AARCH64EB__ rev x7, x7 #endif - stp x6, x7, [x2], #16 //AES block 0 - store result + stp x6, x7, [PTR(2)], #16 //AES block 0 - store result orr x9, x11, x9, lsl #32 //CTR block 6 - stp x19, x20, [x2], #16 //AES block 1 - store result + stp x19, x20, [PTR(2)], #16 //AES block 1 - store result add w12, w12, #1 //CTR block 6 eor v2.16b, v6.16b, v2.16b //AES block 2 - result @@ -3509,14 +3531,14 @@ aes_gcm_dec_192_kernel: aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 6 - ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + ld1 {v4.16b}, [PTR(0)], #16 //AES block 4k+4 - load ciphertext aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 eor v10.16b, v10.16b, v30.16b //MODULO - karatsuba tidy up pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid - ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext + ld1 {v5.16b}, [PTR(0)], #16 //AES block 4k+5 - load ciphertext eor x23, x23, x13 //AES block 4k+3 - round 12 low #ifdef __AARCH64EB__ rev x23, x23 @@ -3534,21 +3556,21 @@ aes_gcm_dec_192_kernel: aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 - ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext + ld1 {v6.16b}, [PTR(0)], #16 //AES block 4k+6 - load ciphertext aese v1.16b, v29.16b //AES block 4k+5 - round 11 - ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext + ld1 {v7.16b}, [PTR(0)], #16 //AES block 4k+7 - load ciphertext rev w9, w12 //CTR block 4k+8 aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 - stp x21, x22, [x2], #16 //AES block 4k+2 - store result + stp x21, x22, [PTR(2)], #16 //AES block 4k+2 - store result aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 
4k+6 - round 9 eor v10.16b, v10.16b, v9.16b //MODULO - fold into mid - cmp x0, x5 //.LOOP CONTROL + cmp x0, x5 //.LOOP CONTROL eor v0.16b, v4.16b, v0.16b //AES block 4k+4 - result eor x24, x24, x14 //AES block 4k+3 - round 12 high @@ -3568,7 +3590,7 @@ aes_gcm_dec_192_kernel: mov x19, v1.d[0] //AES block 4k+5 - mov low mov x6, v0.d[0] //AES block 4k+4 - mov low - stp x23, x24, [x2], #16 //AES block 4k+3 - store result + stp x23, x24, [PTR(2)], #16 //AES block 4k+3 - store result rev64 v5.16b, v5.16b //GHASH block 4k+5 aese v2.16b, v29.16b //AES block 4k+6 - round 11 @@ -3609,7 +3631,7 @@ aes_gcm_dec_192_kernel: #ifdef __AARCH64EB__ rev x7, x7 #endif - stp x6, x7, [x2], #16 //AES block 4k+4 - store result + stp x6, x7, [PTR(2)], #16 //AES block 4k+4 - store result eor v11.16b, v11.16b, v10.16b //MODULO - fold into low add w12, w12, #1 //CTR block 4k+10 @@ -3617,7 +3639,7 @@ aes_gcm_dec_192_kernel: orr x9, x11, x9, lsl #32 //CTR block 4k+10 aese v3.16b, v29.16b //AES block 4k+7 - round 11 - stp x19, x20, [x2], #16 //AES block 4k+5 - store result + stp x19, x20, [PTR(2)], #16 //AES block 4k+5 - store result b.lt .L192_dec_main_loop .L192_dec_prepretail: //PREPRETAIL @@ -3685,10 +3707,10 @@ aes_gcm_dec_192_kernel: #ifdef __AARCH64EB__ rev x23, x23 #endif - stp x21, x22, [x2], #16 //AES block 4k+2 - store result + stp x21, x22, [PTR(2)], #16 //AES block 4k+2 - store result rev64 v7.16b, v7.16b //GHASH block 4k+3 - stp x23, x24, [x2], #16 //AES block 4k+3 - store result + stp x23, x24, [PTR(2)], #16 //AES block 4k+3 - store result aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 0 @@ -3860,8 +3882,8 @@ aes_gcm_dec_192_kernel: eor v11.16b, v11.16b, v10.16b //MODULO - fold into low .L192_dec_tail: //TAIL - sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process - ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [PTR(0)], #16 //AES block 4k+4 - load ciphertext eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result @@ -3904,9 +3926,9 @@ aes_gcm_dec_192_kernel: b .L192_dec_blocks_less_than_1 .L192_dec_blocks_more_than_3: //blocks left > 3 rev64 v4.16b, v5.16b //GHASH final-3 block - ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext + ld1 { v5.16b}, [PTR(0)], #16 //AES final-2 block - load ciphertext - stp x6, x7, [x2], #16 //AES final-3 block - store result + stp x6, x7, [PTR(2)], #16 //AES final-3 block - store result eor v4.16b, v4.16b, v8.16b //feed in partial tag @@ -3937,7 +3959,7 @@ aes_gcm_dec_192_kernel: .L192_dec_blocks_more_than_2: //blocks left > 2 rev64 v4.16b, v5.16b //GHASH final-2 block - ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext + ld1 { v5.16b}, [PTR(0)], #16 //AES final-1 block - load ciphertext eor v4.16b, v4.16b, v8.16b //feed in partial tag @@ -3949,7 +3971,7 @@ aes_gcm_dec_192_kernel: pmull v21.1q, v4.1d, v14.1d //GHASH final-2 block - low - stp x6, x7, [x2], #16 //AES final-2 block - store result + stp x6, x7, [PTR(2)], #16 //AES final-2 block - store result eor v22.8b, v22.8b, v4.8b //GHASH final-2 block - mid mov x7, v0.d[1] //AES final-1 block - mov high @@ -3976,14 +3998,14 @@ aes_gcm_dec_192_kernel: rev64 v4.16b, v5.16b //GHASH final-1 block eor v4.16b, v4.16b, v8.16b //feed in partial tag - ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext + ld1 { v5.16b}, [PTR(0)], #16 //AES final block - load ciphertext mov d22, v4.d[1] //GHASH final-1 block - mid pmull2 v20.1q, v4.2d, v13.2d //GHASH final-1 block 
- high eor v0.16b, v5.16b, v3.16b //AES final block - result - stp x6, x7, [x2], #16 //AES final-1 block - store result + stp x6, x7, [PTR(2)], #16 //AES final-1 block - store result eor v22.8b, v22.8b, v4.8b //GHASH final-1 block - mid @@ -4011,7 +4033,7 @@ aes_gcm_dec_192_kernel: .L192_dec_blocks_less_than_1: //blocks left <= 1 mvn x13, xzr //rk12_l = 0xffffffffffffffff - ldp x4, x5, [x2] //load existing bytes we need to not overwrite + ldp x4, x5, [PTR(2)] //load existing bytes we need to not overwrite and x1, x1, #127 //bit_length %= 128 sub x1, x1, #128 //bit_length -= 128 @@ -4029,7 +4051,7 @@ aes_gcm_dec_192_kernel: fmov d0, x9 //ctr0b is mask for last block and x6, x6, x9 - bic x4, x4, x9 //mask out low existing bytes + bic x4, x4, x9 //mask out low existing bytes orr x6, x6, x4 mov v0.d[1], x10 @@ -4040,7 +4062,7 @@ aes_gcm_dec_192_kernel: #endif and v5.16b, v5.16b, v0.16b //possibly partial last block has zeroes in highest bits - str w9, [x16, #12] //store the updated counter + str w9, [PTR(16), #12] //store the updated counter rev64 v4.16b, v5.16b //GHASH final block @@ -4073,7 +4095,7 @@ aes_gcm_dec_192_kernel: pmull v31.1q, v9.1d, v8.1d //MODULO - top 64b align with mid orr x7, x7, x5 - stp x6, x7, [x2] + stp x6, x7, [PTR(2)] ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment @@ -4091,15 +4113,15 @@ aes_gcm_dec_192_kernel: ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 - st1 { v11.16b }, [x3] - - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + st1 { v11.16b }, [PTR(3)] + + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L192_dec_ret: @@ -4112,36 +4134,41 @@ aes_gcm_dec_192_kernel: aes_gcm_enc_256_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L256_enc_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] - - add x4, x0, x1, lsr #3 //end_input_ptr + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov PTR(16), PTR(4) + mov PTR(8), PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + +#ifdef __CHERI_PURE_CAPABILITY__ + lsr x4, x1, #3 + add PTR(4), PTR(0), x4 //end_input_ptr +#else + add PTR(4), PTR(0), x1, lsr #3 //end_input_ptr +#endif lsr x5, x1, #3 //byte_len mov x15, x5 - ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 + ldp x10, x11, [PTR(16)] //ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev x10, x10 rev x11, x11 #endif - ldp x13, x14, [x8, #224] //load rk14 + ldp x13, x14, [PTR(8), #224] //load rk14 #ifdef __AARCH64EB__ ror x13, x13, #32 ror x14, x14, #32 #endif - ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + ld1 { v0.16b}, [PTR(16)] //special case vector load initial counter so we can start first AES block as quickly as possible sub x5, x5, #1 //byte_len - 1 - ld1 {v18.4s}, [x8], #16 //load rk0 + ld1 {v18.4s}, [PTR(8)], #16 //load rk0 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) - ld1 {v19.4s}, [x8], #16 //load rk1 + ld1 {v19.4s}, [PTR(8)], #16 //load rk1 add x5, x5, x0 lsr x12, x11, #32 @@ -4149,7 +4176,7 @@ aes_gcm_enc_256_kernel: orr w11, w11, w11 rev w12, w12 //rev_ctr32 - cmp x0, x5 //check if we have <= 4 blocks + cmp x0, x5 //check if we have <= 4 blocks fmov d1, x10 //CTR block 1 aese v0.16b, v18.16b @@ -4161,14 +4188,14 @@ aes_gcm_enc_256_kernel: orr x9, x11, x9, lsl #32 //CTR block 1 add w12, w12, #1 //CTR block 1 - ld1 {v20.4s}, [x8], #16 //load rk2 + ld1 {v20.4s}, [PTR(8)], #16 //load rk2 fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 add w12, w12, #1 //CTR block 2 orr x9, x11, x9, lsl #32 //CTR block 2 - ld1 {v21.4s}, [x8], #16 //load rk3 + ld1 {v21.4s}, [PTR(8)], #16 //load rk3 fmov v2.d[1], x9 //CTR block 2 rev w9, w12 //CTR block 3 @@ -4181,53 +4208,53 @@ aes_gcm_enc_256_kernel: aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 - ld1 {v22.4s}, [x8], #16 //load rk4 + ld1 {v22.4s}, [PTR(8)], #16 //load rk4 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 - ld1 {v23.4s}, [x8], #16 //load rk5 + ld1 {v23.4s}, [PTR(8)], #16 //load rk5 aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ld1 {v24.4s}, [x8], #16 //load rk6 + ld1 {v24.4s}, [PTR(8)], #16 //load rk6 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 - ldr q14, [x3, #80] //load h3l | h3h + ldr q14, [PTR(3), #80] //load h3l | h3h #ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 - ld1 {v25.4s}, [x8], #16 //load rk7 + ld1 {v25.4s}, [PTR(8)], #16 //load rk7 aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 - ld1 {v26.4s}, [x8], #16 //load rk8 + ld1 {v26.4s}, [PTR(8)], #16 //load rk8 aese v1.16b, v20.16b aesmc v1.16b, v1.16b //AES block 1 - round 2 - ldr q13, [x3, #64] //load h2l | h2h + ldr q13, [PTR(3), #64] //load h2l | h2h #ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese v3.16b, v19.16b aesmc v3.16b, v3.16b //AES block 3 - round 1 - ld1 {v27.4s}, [x8], #16 //load rk9 + ld1 {v27.4s}, [PTR(8)], #16 //load rk9 aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 - ldr q15, [x3, #112] //load h4l | h4h + ldr q15, [PTR(3), #112] //load h4l 
| h4h #ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 #endif aese v1.16b, v21.16b aesmc v1.16b, v1.16b //AES block 1 - round 3 - ld1 {v28.4s}, [x8], #16 //load rk10 + ld1 {v28.4s}, [PTR(8)], #16 //load rk10 aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 - ld1 {v29.4s}, [x8], #16 //load rk11 + ld1 {v29.4s}, [PTR(8)], #16 //load rk11 aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 @@ -4238,7 +4265,7 @@ aes_gcm_enc_256_kernel: aese v3.16b, v21.16b aesmc v3.16b, v3.16b //AES block 3 - round 3 - ld1 { v11.16b}, [x3] + ld1 { v11.16b}, [PTR(3)] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b @@ -4272,17 +4299,17 @@ aes_gcm_enc_256_kernel: aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 3 - round 6 - ld1 {v30.4s}, [x8], #16 //load rk12 + ld1 {v30.4s}, [PTR(8)], #16 //load rk12 aese v0.16b, v24.16b aesmc v0.16b, v0.16b //AES block 0 - round 6 - ldr q12, [x3, #32] //load h1l | h1h + ldr q12, [PTR(3), #32] //load h1l | h1h #ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese v2.16b, v24.16b aesmc v2.16b, v2.16b //AES block 2 - round 6 - ld1 {v31.4s}, [x8], #16 //load rk13 + ld1 {v31.4s}, [PTR(8)], #16 //load rk13 aese v1.16b, v25.16b aesmc v1.16b, v1.16b //AES block 1 - round 7 @@ -4370,28 +4397,28 @@ aes_gcm_enc_256_kernel: eor v16.16b, v16.16b, v8.16b //h2k | h1k b.ge .L256_enc_tail //handle tail - ldp x19, x20, [x0, #16] //AES block 1 - load plaintext + ldp x19, x20, [PTR(0), #16] //AES block 1 - load plaintext #ifdef __AARCH64EB__ rev x19, x19 rev x20, x20 #endif rev w9, w12 //CTR block 4 - ldp x6, x7, [x0, #0] //AES block 0 - load plaintext + ldp x6, x7, [PTR(0), #0] //AES block 0 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 #endif - ldp x23, x24, [x0, #48] //AES block 3 - load plaintext + ldp x23, x24, [PTR(0), #48] //AES block 3 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 #endif - ldp x21, x22, [x0, #32] //AES block 2 - load plaintext + ldp x21, x22, [PTR(0), #32] //AES block 2 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 #endif - add x0, x0, #64 //AES input_ptr update + add PTR(0), PTR(0), #64 //AES input_ptr update eor x19, x19, x13 //AES block 1 - round 14 low eor x20, x20, x14 //AES block 1 - round 14 high @@ -4403,7 +4430,7 @@ aes_gcm_enc_256_kernel: eor x24, x24, x14 //AES block 3 - round 14 high fmov d4, x6 //AES block 0 - mov low - cmp x0, x5 //check if we have <= 8 blocks + cmp x0, x5 //check if we have <= 8 blocks fmov v4.d[1], x7 //AES block 0 - mov high eor x23, x23, x13 //AES block 3 - round 14 low @@ -4432,25 +4459,25 @@ aes_gcm_enc_256_kernel: fmov v1.d[1], x9 //CTR block 5 rev w9, w12 //CTR block 6 - st1 { v4.16b}, [x2], #16 //AES block 0 - store result + st1 { v4.16b}, [PTR(2)], #16 //AES block 0 - store result fmov v7.d[1], x24 //AES block 3 - mov high orr x9, x11, x9, lsl #32 //CTR block 6 eor v6.16b, v6.16b, v2.16b //AES block 2 - result - st1 { v5.16b}, [x2], #16 //AES block 1 - store result + st1 { v5.16b}, [PTR(2)], #16 //AES block 1 - store result add w12, w12, #1 //CTR block 6 fmov d2, x10 //CTR block 6 fmov v2.d[1], x9 //CTR block 6 - st1 { v6.16b}, [x2], #16 //AES block 2 - store result + st1 { v6.16b}, [PTR(2)], #16 //AES block 2 - store result rev w9, w12 //CTR block 7 orr x9, x11, x9, lsl #32 //CTR block 7 eor v7.16b, v7.16b, v3.16b //AES block 3 - result - st1 { v7.16b}, [x2], #16 //AES block 3 - store result + st1 { v7.16b}, [PTR(2)], #16 //AES block 3 - store result b.ge .L256_enc_prepretail //do prepretail .L256_enc_main_loop: //main 
loop start @@ -4472,14 +4499,14 @@ aes_gcm_enc_256_kernel: aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 1 - ldp x23, x24, [x0, #48] //AES block 4k+7 - load plaintext + ldp x23, x24, [PTR(0), #48] //AES block 4k+7 - load plaintext #ifdef __AARCH64EB__ rev x23, x23 rev x24, x24 #endif aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 1 - ldp x21, x22, [x0, #32] //AES block 4k+6 - load plaintext + ldp x21, x22, [PTR(0), #32] //AES block 4k+6 - load plaintext #ifdef __AARCH64EB__ rev x21, x21 rev x22, x22 @@ -4592,7 +4619,7 @@ aes_gcm_enc_256_kernel: aese v3.16b, v24.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 6 - ldp x19, x20, [x0, #16] //AES block 4k+5 - load plaintext + ldp x19, x20, [PTR(0), #16] //AES block 4k+5 - load plaintext #ifdef __AARCH64EB__ rev x19, x19 rev x20, x20 @@ -4632,7 +4659,7 @@ aes_gcm_enc_256_kernel: aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 8 - ldp x6, x7, [x0, #0] //AES block 4k+4 - load plaintext + ldp x6, x7, [PTR(0), #0] //AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -4662,7 +4689,7 @@ aes_gcm_enc_256_kernel: aese v1.16b, v29.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 - add x0, x0, #64 //AES input_ptr update + add PTR(0), PTR(0), #64 //AES input_ptr update pmull v7.1q, v9.1d, v8.1d //MODULO - top 64b align with mid rev w9, w12 //CTR block 4k+8 @@ -4708,7 +4735,7 @@ aes_gcm_enc_256_kernel: fmov v5.d[1], x20 //AES block 4k+5 - mov high fmov d6, x21 //AES block 4k+6 - mov low - cmp x0, x5 //.LOOP CONTROL + cmp x0, x5 //.LOOP CONTROL fmov v6.d[1], x22 //AES block 4k+6 - mov high @@ -4730,21 +4757,21 @@ aes_gcm_enc_256_kernel: aese v2.16b, v31.16b //AES block 4k+6 - round 13 rev w9, w12 //CTR block 4k+10 - st1 { v4.16b}, [x2], #16 //AES block 4k+4 - store result + st1 { v4.16b}, [PTR(2)], #16 //AES block 4k+4 - store result orr x9, x11, x9, lsl #32 //CTR block 4k+10 eor v11.16b, v11.16b, v9.16b //MODULO - fold into low fmov v7.d[1], x24 //AES block 4k+7 - mov high ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment - st1 { v5.16b}, [x2], #16 //AES block 4k+5 - store result + st1 { v5.16b}, [PTR(2)], #16 //AES block 4k+5 - store result add w12, w12, #1 //CTR block 4k+10 aese v3.16b, v31.16b //AES block 4k+7 - round 13 eor v6.16b, v6.16b, v2.16b //AES block 4k+6 - result fmov d2, x10 //CTR block 4k+10 - st1 { v6.16b}, [x2], #16 //AES block 4k+6 - store result + st1 { v6.16b}, [PTR(2)], #16 //AES block 4k+6 - store result fmov v2.d[1], x9 //CTR block 4k+10 rev w9, w12 //CTR block 4k+11 @@ -4752,7 +4779,7 @@ aes_gcm_enc_256_kernel: orr x9, x11, x9, lsl #32 //CTR block 4k+11 eor v7.16b, v7.16b, v3.16b //AES block 4k+7 - result - st1 { v7.16b}, [x2], #16 //AES block 4k+7 - store result + st1 { v7.16b}, [PTR(2)], #16 //AES block 4k+7 - store result b.lt .L256_enc_main_loop .L256_enc_prepretail: //PREPRETAIL @@ -4996,8 +5023,8 @@ aes_gcm_enc_256_kernel: .L256_enc_tail: //TAIL ext v8.16b, v11.16b, v11.16b, #8 //prepare final partial tag - sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process - ldp x6, x7, [x0], #16 //AES block 4k+4 - load plaintext + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ldp x6, x7, [PTR(0)], #16 //AES block 4k+4 - load plaintext #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -5033,9 +5060,9 @@ aes_gcm_enc_256_kernel: sub w12, w12, #1 b .L256_enc_blocks_less_than_1 .L256_enc_blocks_more_than_3: //blocks left > 3 - st1 { v5.16b}, [x2], #16 //AES final-3 block - store result + st1 { 
v5.16b}, [PTR(2)], #16 //AES final-3 block - store result - ldp x6, x7, [x0], #16 //AES final-2 block - load input low & high + ldp x6, x7, [PTR(0)], #16 //AES final-2 block - load input low & high #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -5065,9 +5092,9 @@ aes_gcm_enc_256_kernel: eor v5.16b, v5.16b, v1.16b //AES final-2 block - result .L256_enc_blocks_more_than_2: //blocks left > 2 - st1 { v5.16b}, [x2], #16 //AES final-2 block - store result + st1 { v5.16b}, [PTR(2)], #16 //AES final-2 block - store result - ldp x6, x7, [x0], #16 //AES final-1 block - load input low & high + ldp x6, x7, [PTR(0)], #16 //AES final-1 block - load input low & high #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -5102,11 +5129,11 @@ aes_gcm_enc_256_kernel: eor v10.16b, v10.16b, v22.16b //GHASH final-2 block - mid .L256_enc_blocks_more_than_1: //blocks left > 1 - st1 { v5.16b}, [x2], #16 //AES final-1 block - store result + st1 { v5.16b}, [PTR(2)], #16 //AES final-1 block - store result rev64 v4.16b, v5.16b //GHASH final-1 block - ldp x6, x7, [x0], #16 //AES final block - load input low & high + ldp x6, x7, [PTR(0)], #16 //AES final block - load input low & high #ifdef __AARCH64EB__ rev x6, x6 rev x7, x7 @@ -5146,7 +5173,7 @@ aes_gcm_enc_256_kernel: sub x1, x1, #128 //bit_length -= 128 neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) - ld1 { v18.16b}, [x2] //load existing bytes where the possibly partial last block is to be stored + ld1 { v18.16b}, [PTR(2)] //load existing bytes where the possibly partial last block is to be stored mvn x14, xzr //rk14_h = 0xffffffffffffffff and x1, x1, #127 //bit_length %= 128 @@ -5207,24 +5234,24 @@ aes_gcm_enc_256_kernel: ext v10.16b, v10.16b, v10.16b, #8 //MODULO - other mid alignment - str w9, [x16, #12] //store the updated counter + str w9, [PTR(16), #12] //store the updated counter - st1 { v5.16b}, [x2] //store all 16B + st1 { v5.16b}, [PTR(2)] //store all 16B eor v11.16b, v11.16b, v9.16b //MODULO - fold into low eor v11.16b, v11.16b, v10.16b //MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 - st1 { v11.16b }, [x3] - - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + st1 { v11.16b }, [PTR(3)] + + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L256_enc_ret: @@ -5237,42 +5264,47 @@ aes_gcm_enc_256_kernel: aes_gcm_dec_256_kernel: AARCH64_VALID_CALL_TARGET cbz x1, .L256_dec_ret - stp x19, x20, [sp, #-112]! - mov x16, x4 - mov x8, x5 - stp x21, x22, [sp, #16] - stp x23, x24, [sp, #32] - stp d8, d9, [sp, #48] - stp d10, d11, [sp, #64] - stp d12, d13, [sp, #80] - stp d14, d15, [sp, #96] + stp PTR(19), PTR(20), [PTRN(sp), #-(6*PTR_WIDTH+64)]! 
+ mov PTR(16), PTR(4) + mov PTR(8), PTR(5) + stp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + stp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + stp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + stp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + stp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + stp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] lsr x5, x1, #3 //byte_len mov x15, x5 - ldp x10, x11, [x16] //ctr96_b64, ctr96_t32 + ldp x10, x11, [PTR(16)] //ctr96_b64, ctr96_t32 #ifdef __AARCH64EB__ rev x10, x10 rev x11, x11 #endif - ldp x13, x14, [x8, #224] //load rk14 + ldp x13, x14, [PTR(8), #224] //load rk14 #ifdef __AARCH64EB__ ror x14, x14, #32 ror x13, x13, #32 #endif - ld1 {v18.4s}, [x8], #16 //load rk0 + ld1 {v18.4s}, [PTR(8)], #16 //load rk0 sub x5, x5, #1 //byte_len - 1 - ld1 {v19.4s}, [x8], #16 //load rk1 + ld1 {v19.4s}, [PTR(8)], #16 //load rk1 and x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail) - add x4, x0, x1, lsr #3 //end_input_ptr - ld1 {v20.4s}, [x8], #16 //load rk2 +#ifdef __CHERI_PURE_CAPABILITY__ + lsr x4, x1, #3 + add PTR(4), PTR(0), x4 //end_input_ptr +#else + add PTR(4), PTR(0), x1, lsr #3 //end_input_ptr +#endif + ld1 {v20.4s}, [PTR(8)], #16 //load rk2 lsr x12, x11, #32 - ld1 {v21.4s}, [x8], #16 //load rk3 + ld1 {v21.4s}, [PTR(8)], #16 //load rk3 orr w11, w11, w11 - ld1 {v22.4s}, [x8], #16 //load rk4 + ld1 {v22.4s}, [PTR(8)], #16 //load rk4 add x5, x5, x0 rev w12, w12 //rev_ctr32 @@ -5284,7 +5316,7 @@ aes_gcm_dec_256_kernel: fmov d1, x10 //CTR block 1 orr x9, x11, x9, lsl #32 //CTR block 1 - ld1 { v0.16b}, [x16] //special case vector load initial counter so we can start first AES block as quickly as possible + ld1 { v0.16b}, [PTR(16)] //special case vector load initial counter so we can start first AES block as quickly as possible fmov v1.d[1], x9 //CTR block 1 rev w9, w12 //CTR block 2 @@ -5297,68 +5329,68 @@ aes_gcm_dec_256_kernel: rev w9, w12 //CTR block 3 orr x9, x11, x9, lsl #32 //CTR block 3 - ld1 {v23.4s}, [x8], #16 //load rk5 + ld1 {v23.4s}, [PTR(8)], #16 //load rk5 fmov v3.d[1], x9 //CTR block 3 add w12, w12, #1 //CTR block 3 - ld1 {v24.4s}, [x8], #16 //load rk6 + ld1 {v24.4s}, [PTR(8)], #16 //load rk6 - ld1 {v25.4s}, [x8], #16 //load rk7 + ld1 {v25.4s}, [PTR(8)], #16 //load rk7 - ld1 {v26.4s}, [x8], #16 //load rk8 + ld1 {v26.4s}, [PTR(8)], #16 //load rk8 aese v0.16b, v18.16b aesmc v0.16b, v0.16b //AES block 0 - round 0 - ldr q14, [x3, #80] //load h3l | h3h + ldr q14, [PTR(3), #80] //load h3l | h3h #ifndef __AARCH64EB__ ext v14.16b, v14.16b, v14.16b, #8 #endif aese v3.16b, v18.16b aesmc v3.16b, v3.16b //AES block 3 - round 0 - ldr q15, [x3, #112] //load h4l | h4h + ldr q15, [PTR(3), #112] //load h4l | h4h #ifndef __AARCH64EB__ ext v15.16b, v15.16b, v15.16b, #8 #endif aese v1.16b, v18.16b aesmc v1.16b, v1.16b //AES block 1 - round 0 - ldr q13, [x3, #64] //load h2l | h2h + ldr q13, [PTR(3), #64] //load h2l | h2h #ifndef __AARCH64EB__ ext v13.16b, v13.16b, v13.16b, #8 #endif aese v2.16b, v18.16b aesmc v2.16b, v2.16b //AES block 2 - round 0 - ld1 {v27.4s}, [x8], #16 //load rk9 + ld1 {v27.4s}, [PTR(8)], #16 //load rk9 aese v0.16b, v19.16b aesmc v0.16b, v0.16b //AES block 0 - round 1 aese v1.16b, v19.16b aesmc v1.16b, v1.16b //AES block 1 - round 1 - ld1 { v11.16b}, [x3] + ld1 { v11.16b}, [PTR(3)] ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b aese v2.16b, v19.16b aesmc v2.16b, v2.16b //AES block 2 - round 1 - ld1 {v28.4s}, [x8], #16 //load rk10 + ld1 {v28.4s}, [PTR(8)], #16 //load rk10 aese v3.16b, v19.16b 
aesmc v3.16b, v3.16b //AES block 3 - round 1 - ld1 {v29.4s}, [x8], #16 //load rk11 + ld1 {v29.4s}, [PTR(8)], #16 //load rk11 aese v0.16b, v20.16b aesmc v0.16b, v0.16b //AES block 0 - round 2 - ldr q12, [x3, #32] //load h1l | h1h + ldr q12, [PTR(3), #32] //load h1l | h1h #ifndef __AARCH64EB__ ext v12.16b, v12.16b, v12.16b, #8 #endif aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 2 - round 2 - ld1 {v30.4s}, [x8], #16 //load rk12 + ld1 {v30.4s}, [PTR(8)], #16 //load rk12 aese v3.16b, v20.16b aesmc v3.16b, v3.16b //AES block 3 - round 2 @@ -5374,7 +5406,7 @@ aes_gcm_dec_256_kernel: aese v0.16b, v22.16b aesmc v0.16b, v0.16b //AES block 0 - round 4 - cmp x0, x5 //check if we have <= 4 blocks + cmp x0, x5 //check if we have <= 4 blocks aese v2.16b, v21.16b aesmc v2.16b, v2.16b //AES block 2 - round 3 @@ -5441,7 +5473,7 @@ aes_gcm_dec_256_kernel: aese v2.16b, v26.16b aesmc v2.16b, v2.16b //AES block 2 - round 8 - ld1 {v31.4s}, [x8], #16 //load rk13 + ld1 {v31.4s}, [PTR(8)], #16 //load rk13 aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 1 - round 9 @@ -5506,7 +5538,7 @@ aes_gcm_dec_256_kernel: aese v0.16b, v31.16b //AES block 0 - round 13 b.ge .L256_dec_tail //handle tail - ld1 {v4.16b, v5.16b}, [x0], #32 //AES block 0,1 - load ciphertext + ld1 {v4.16b, v5.16b}, [PTR(0)], #32 //AES block 0,1 - load ciphertext rev w9, w12 //CTR block 4 @@ -5514,7 +5546,7 @@ aes_gcm_dec_256_kernel: eor v1.16b, v5.16b, v1.16b //AES block 1 - result rev64 v5.16b, v5.16b //GHASH block 1 - ld1 {v6.16b}, [x0], #16 //AES block 2 - load ciphertext + ld1 {v6.16b}, [PTR(0)], #16 //AES block 2 - load ciphertext mov x7, v0.d[1] //AES block 0 - mov high @@ -5541,10 +5573,10 @@ aes_gcm_dec_256_kernel: #ifdef __AARCH64EB__ rev x6, x6 #endif - stp x6, x7, [x2], #16 //AES block 0 - store result + stp x6, x7, [PTR(2)], #16 //AES block 0 - store result fmov d1, x10 //CTR block 5 - ld1 {v7.16b}, [x0], #16 //AES block 3 - load ciphertext + ld1 {v7.16b}, [PTR(0)], #16 //AES block 3 - load ciphertext fmov v1.d[1], x9 //CTR block 5 rev w9, w12 //CTR block 6 @@ -5560,10 +5592,10 @@ aes_gcm_dec_256_kernel: #ifdef __AARCH64EB__ rev x20, x20 #endif - stp x19, x20, [x2], #16 //AES block 1 - store result + stp x19, x20, [PTR(2)], #16 //AES block 1 - store result eor v2.16b, v6.16b, v2.16b //AES block 2 - result - cmp x0, x5 //check if we have <= 8 blocks + cmp x0, x5 //check if we have <= 8 blocks b.ge .L256_dec_prepretail //do prepretail .L256_dec_main_loop: //main loop start @@ -5629,7 +5661,7 @@ aes_gcm_dec_256_kernel: #endif aese v2.16b, v20.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 2 - stp x21, x22, [x2], #16 //AES block 4k+2 - store result + stp x21, x22, [PTR(2)], #16 //AES block 4k+2 - store result pmull v11.1q, v4.1d, v15.1d //GHASH block 4k - low @@ -5769,7 +5801,7 @@ aes_gcm_dec_256_kernel: aese v1.16b, v27.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 9 - ld1 {v4.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + ld1 {v4.16b}, [PTR(0)], #16 //AES block 4k+4 - load ciphertext aese v0.16b, v31.16b //AES block 4k+4 - round 13 ext v9.16b, v9.16b, v9.16b, #8 //MODULO - other top alignment @@ -5780,7 +5812,7 @@ aes_gcm_dec_256_kernel: aese v2.16b, v27.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 9 - ld1 {v5.16b}, [x0], #16 //AES block 4k+5 - load ciphertext + ld1 {v5.16b}, [PTR(0)], #16 //AES block 4k+5 - load ciphertext aese v3.16b, v26.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 8 @@ -5788,7 +5820,7 @@ aes_gcm_dec_256_kernel: aese v1.16b, v29.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 11 - 
stp x23, x24, [x2], #16 //AES block 4k+3 - store result + stp x23, x24, [PTR(2)], #16 //AES block 4k+3 - store result aese v2.16b, v28.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 10 @@ -5796,11 +5828,11 @@ aes_gcm_dec_256_kernel: aese v3.16b, v27.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 9 - ld1 {v6.16b}, [x0], #16 //AES block 4k+6 - load ciphertext + ld1 {v6.16b}, [PTR(0)], #16 //AES block 4k+6 - load ciphertext aese v1.16b, v30.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 - ld1 {v7.16b}, [x0], #16 //AES block 4k+7 - load ciphertext + ld1 {v7.16b}, [PTR(0)], #16 //AES block 4k+7 - load ciphertext aese v2.16b, v29.16b aesmc v2.16b, v2.16b //AES block 4k+6 - round 11 @@ -5827,7 +5859,7 @@ aes_gcm_dec_256_kernel: aese v2.16b, v31.16b //AES block 4k+6 - round 13 orr x9, x11, x9, lsl #32 //CTR block 4k+9 - cmp x0, x5 //.LOOP CONTROL + cmp x0, x5 //.LOOP CONTROL add w12, w12, #1 //CTR block 4k+9 @@ -5862,13 +5894,13 @@ aes_gcm_dec_256_kernel: #ifdef __AARCH64EB__ rev x20, x20 #endif - stp x6, x7, [x2], #16 //AES block 4k+4 - store result + stp x6, x7, [PTR(2)], #16 //AES block 4k+4 - store result eor x19, x19, x13 //AES block 4k+5 - round 14 low #ifdef __AARCH64EB__ rev x19, x19 #endif - stp x19, x20, [x2], #16 //AES block 4k+5 - store result + stp x19, x20, [PTR(2)], #16 //AES block 4k+5 - store result rev64 v4.16b, v4.16b //GHASH block 4k+4 eor v11.16b, v11.16b, v10.16b //MODULO - fold into low @@ -6109,7 +6141,7 @@ aes_gcm_dec_256_kernel: aese v3.16b, v29.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 11 - stp x21, x22, [x2], #16 //AES block 4k+2 - store result + stp x21, x22, [PTR(2)], #16 //AES block 4k+2 - store result aese v1.16b, v30.16b aesmc v1.16b, v1.16b //AES block 4k+5 - round 12 @@ -6117,7 +6149,7 @@ aes_gcm_dec_256_kernel: aese v0.16b, v30.16b aesmc v0.16b, v0.16b //AES block 4k+4 - round 12 - stp x23, x24, [x2], #16 //AES block 4k+3 - store result + stp x23, x24, [PTR(2)], #16 //AES block 4k+3 - store result aese v3.16b, v30.16b aesmc v3.16b, v3.16b //AES block 4k+7 - round 12 @@ -6133,8 +6165,8 @@ aes_gcm_dec_256_kernel: eor v11.16b, v11.16b, v10.16b //MODULO - fold into low .L256_dec_tail: //TAIL - sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process - ld1 { v5.16b}, [x0], #16 //AES block 4k+4 - load ciphertext + sub x5, x4, x0 //main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [PTR(0)], #16 //AES block 4k+4 - load ciphertext eor v0.16b, v5.16b, v0.16b //AES block 4k+4 - result @@ -6177,9 +6209,9 @@ aes_gcm_dec_256_kernel: b .L256_dec_blocks_less_than_1 .L256_dec_blocks_more_than_3: //blocks left > 3 rev64 v4.16b, v5.16b //GHASH final-3 block - ld1 { v5.16b}, [x0], #16 //AES final-2 block - load ciphertext + ld1 { v5.16b}, [PTR(0)], #16 //AES final-2 block - load ciphertext - stp x6, x7, [x2], #16 //AES final-3 block - store result + stp x6, x7, [PTR(2)], #16 //AES final-3 block - store result mov d10, v17.d[1] //GHASH final-3 block - mid @@ -6213,10 +6245,10 @@ aes_gcm_dec_256_kernel: .L256_dec_blocks_more_than_2: //blocks left > 2 rev64 v4.16b, v5.16b //GHASH final-2 block - ld1 { v5.16b}, [x0], #16 //AES final-1 block - load ciphertext + ld1 { v5.16b}, [PTR(0)], #16 //AES final-1 block - load ciphertext eor v4.16b, v4.16b, v8.16b //feed in partial tag - stp x6, x7, [x2], #16 //AES final-2 block - store result + stp x6, x7, [PTR(2)], #16 //AES final-2 block - store result eor v0.16b, v5.16b, v2.16b //AES final-1 block - result @@ -6248,10 +6280,10 @@ aes_gcm_dec_256_kernel: #endif .L256_dec_blocks_more_than_1: 
//blocks left > 1 - stp x6, x7, [x2], #16 //AES final-1 block - store result + stp x6, x7, [PTR(2)], #16 //AES final-1 block - store result rev64 v4.16b, v5.16b //GHASH final-1 block - ld1 { v5.16b}, [x0], #16 //AES final block - load ciphertext + ld1 { v5.16b}, [PTR(0)], #16 //AES final block - load ciphertext eor v4.16b, v4.16b, v8.16b //feed in partial tag movi v8.8b, #0 //suppress further partial tag feed in @@ -6293,7 +6325,7 @@ aes_gcm_dec_256_kernel: sub x1, x1, #128 //bit_length -= 128 mvn x13, xzr //rk14_l = 0xffffffffffffffff - ldp x4, x5, [x2] //load existing bytes we need to not overwrite + ldp x4, x5, [PTR(2)] //load existing bytes we need to not overwrite neg x1, x1 //bit_length = 128 - #bits in input (in range [1,128]) and x1, x1, #127 //bit_length %= 128 @@ -6308,7 +6340,7 @@ aes_gcm_dec_256_kernel: and x6, x6, x9 mov v0.d[1], x10 - bic x4, x4, x9 //mask out low existing bytes + bic x4, x4, x9 //mask out low existing bytes #ifndef __AARCH64EB__ rev w9, w12 @@ -6367,23 +6399,23 @@ aes_gcm_dec_256_kernel: eor v11.16b, v11.16b, v8.16b //MODULO - fold into low - stp x6, x7, [x2] + stp x6, x7, [PTR(2)] - str w9, [x16, #12] //store the updated counter + str w9, [PTR(16), #12] //store the updated counter eor v11.16b, v11.16b, v10.16b //MODULO - fold into low ext v11.16b, v11.16b, v11.16b, #8 rev64 v11.16b, v11.16b mov x0, x15 - st1 { v11.16b }, [x3] - - ldp x21, x22, [sp, #16] - ldp x23, x24, [sp, #32] - ldp d8, d9, [sp, #48] - ldp d10, d11, [sp, #64] - ldp d12, d13, [sp, #80] - ldp d14, d15, [sp, #96] - ldp x19, x20, [sp], #112 + st1 { v11.16b }, [PTR(3)] + + ldp PTR(21), PTR(22), [PTRN(sp), #(2*PTR_WIDTH)] + ldp PTR(23), PTR(24), [PTRN(sp), #(4*PTR_WIDTH)] + ldp d8, d9, [PTRN(sp), #(6*PTR_WIDTH)] + ldp d10, d11, [PTRN(sp), #(6*PTR_WIDTH+16)] + ldp d12, d13, [PTRN(sp), #(6*PTR_WIDTH+32)] + ldp d14, d15, [PTRN(sp), #(6*PTR_WIDTH+48)] + ldp PTR(19), PTR(20), [PTRN(sp)], #(6*PTR_WIDTH+64) ret .L256_dec_ret: diff --git a/sys/crypto/openssl/aarch64/aesv8-armx.S b/sys/crypto/openssl/aarch64/aesv8-armx.S index 015c2eea6dbb..13b70a4e8453 100644 --- a/sys/crypto/openssl/aarch64/aesv8-armx.S +++ b/sys/crypto/openssl/aarch64/aesv8-armx.S @@ -2,13 +2,17 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 +#ifndef __CHERI_PURE_CAPABILITY__ .arch armv8-a+crypto +#endif .text .align 5 +.type .Lrcon,%object .Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b +.size .Lrcon, . - .Lrcon .globl aes_v8_set_encrypt_key .type aes_v8_set_encrypt_key,%function @@ -17,8 +21,8 @@ aes_v8_set_encrypt_key: .Lenc_key: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 mov x3,#-1 cmp x0,#0 b.eq .Lenc_key_abort @@ -32,13 +36,13 @@ aes_v8_set_encrypt_key: tst w1,#0x3f b.ne .Lenc_key_abort - adr x3,.Lrcon + adr PTR(3),.Lrcon cmp w1,#192 eor v0.16b,v0.16b,v0.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 mov w1,#8 // reuse w1 - ld1 {v1.4s,v2.4s},[x3],#32 + ld1 {v1.4s,v2.4s},[PTR(3)],#32 b.lt .Loop128 b.eq .L192 @@ -48,7 +52,7 @@ aes_v8_set_encrypt_key: .Loop128: tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 + st1 {v3.4s},[PTR(2)],#16 aese v6.16b,v0.16b subs w1,w1,#1 @@ -62,11 +66,11 @@ aes_v8_set_encrypt_key: eor v3.16b,v3.16b,v6.16b b.ne .Loop128 - ld1 {v1.4s},[x3] + ld1 {v1.4s},[PTR(3)] tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 + st1 {v3.4s},[PTR(2)],#16 aese v6.16b,v0.16b eor v3.16b,v3.16b,v5.16b @@ -80,7 +84,7 @@ aes_v8_set_encrypt_key: tbl v6.16b,{v3.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 + st1 {v3.4s},[PTR(2)],#16 aese v6.16b,v0.16b eor v3.16b,v3.16b,v5.16b @@ -90,27 +94,27 @@ aes_v8_set_encrypt_key: eor v6.16b,v6.16b,v1.16b eor v3.16b,v3.16b,v5.16b eor v3.16b,v3.16b,v6.16b - st1 {v3.4s},[x2] - add x2,x2,#0x50 + st1 {v3.4s},[PTR(2)] + add PTR(2),PTR(2),#0x50 mov w12,#10 b .Ldone .align 4 .L192: - ld1 {v4.8b},[x0],#8 + ld1 {v4.8b},[PTR(0)],#8 movi v6.16b,#8 // borrow v6.16b - st1 {v3.4s},[x2],#16 + st1 {v3.4s},[PTR(2)],#16 sub v2.16b,v2.16b,v6.16b // adjust the mask .Loop192: tbl v6.16b,{v4.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 #ifdef __AARCH64EB__ - st1 {v4.4s},[x2],#16 - sub x2,x2,#8 + st1 {v4.4s},[PTR(2)],#16 + sub PTR(2),PTR(2),#8 #else - st1 {v4.8b},[x2],#8 + st1 {v4.8b},[PTR(2)],#8 #endif aese v6.16b,v0.16b subs w1,w1,#1 @@ -129,24 +133,24 @@ aes_v8_set_encrypt_key: eor v4.16b,v4.16b,v5.16b eor v3.16b,v3.16b,v6.16b eor v4.16b,v4.16b,v6.16b - st1 {v3.4s},[x2],#16 + st1 {v3.4s},[PTR(2)],#16 b.ne .Loop192 mov w12,#12 - add x2,x2,#0x20 + add PTR(2),PTR(2),#0x20 b .Ldone .align 4 .L256: - ld1 {v4.16b},[x0] + ld1 {v4.16b},[PTR(0)] mov w1,#7 mov w12,#14 - st1 {v3.4s},[x2],#16 + st1 {v3.4s},[PTR(2)],#16 .Loop256: tbl v6.16b,{v4.16b},v2.16b ext v5.16b,v0.16b,v3.16b,#12 - st1 {v4.4s},[x2],#16 + st1 {v4.4s},[PTR(2)],#16 aese v6.16b,v0.16b subs w1,w1,#1 @@ -158,7 +162,7 @@ aes_v8_set_encrypt_key: eor v3.16b,v3.16b,v5.16b shl v1.16b,v1.16b,#1 eor v3.16b,v3.16b,v6.16b - st1 {v3.4s},[x2],#16 + st1 {v3.4s},[PTR(2)],#16 b.eq .Ldone dup v6.4s,v3.s[3] // just splat @@ -175,12 +179,12 @@ aes_v8_set_encrypt_key: b .Loop256 .Ldone: - str w12,[x2] + str w12,[PTR(2)] mov x3,#0 .Lenc_key_abort: - mov x0,x3 // return value - ldr x29,[sp],#16 + mov x0,x3 // return value + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key @@ -189,39 +193,39 @@ aes_v8_set_encrypt_key: .align 5 aes_v8_set_decrypt_key: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 bl .Lenc_key cmp x0,#0 b.ne .Ldec_key_abort - sub x2,x2,#240 // restore original x2 + sub PTR(2),PTR(2),#240 // restore original PTR(2) mov x4,#-16 - add x0,x2,x12,lsl#4 // end of key schedule + add PTR(0),PTR(2),x12,lsl#4 // end of key schedule - ld1 {v0.4s},[x2] - ld1 {v1.4s},[x0] - st1 {v0.4s},[x0],x4 - st1 {v1.4s},[x2],#16 + ld1 {v0.4s},[PTR(2)] + ld1 {v1.4s},[PTR(0)] + st1 {v0.4s},[PTR(0)],x4 + st1 {v1.4s},[PTR(2)],#16 .Loop_imc: - ld1 {v0.4s},[x2] - ld1 {v1.4s},[x0] + ld1 {v0.4s},[PTR(2)] + ld1 {v1.4s},[PTR(0)] aesimc v0.16b,v0.16b aesimc v1.16b,v1.16b - st1 {v0.4s},[x0],x4 - st1 {v1.4s},[x2],#16 - cmp x0,x2 + st1 {v0.4s},[PTR(0)],x4 + st1 {v1.4s},[PTR(2)],#16 + cmp PTR(0),PTR(2) b.hi .Loop_imc - ld1 {v0.4s},[x2] + ld1 {v0.4s},[PTR(2)] aesimc v0.16b,v0.16b - st1 {v0.4s},[x0] + st1 {v0.4s},[PTR(0)] eor x0,x0,x0 // return value .Ldec_key_abort: - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key @@ -230,29 +234,29 @@ aes_v8_set_decrypt_key: .align 5 aes_v8_encrypt: AARCH64_VALID_CALL_TARGET - ldr w3,[x2,#240] - ld1 {v0.4s},[x2],#16 - ld1 {v2.16b},[x0] + ldr w3,[PTR(2),#240] + ld1 {v0.4s},[PTR(2)],#16 + ld1 {v2.16b},[PTR(0)] sub w3,w3,#2 - ld1 {v1.4s},[x2],#16 + ld1 {v1.4s},[PTR(2)],#16 .Loop_enc: aese v2.16b,v0.16b aesmc v2.16b,v2.16b - ld1 {v0.4s},[x2],#16 + ld1 {v0.4s},[PTR(2)],#16 subs w3,w3,#2 aese v2.16b,v1.16b aesmc v2.16b,v2.16b - ld1 {v1.4s},[x2],#16 + ld1 {v1.4s},[PTR(2)],#16 b.gt .Loop_enc aese v2.16b,v0.16b aesmc v2.16b,v2.16b - ld1 {v0.4s},[x2] + ld1 {v0.4s},[PTR(2)] aese v2.16b,v1.16b eor v2.16b,v2.16b,v0.16b - st1 {v2.16b},[x1] + st1 {v2.16b},[PTR(1)] ret .size aes_v8_encrypt,.-aes_v8_encrypt .globl aes_v8_decrypt @@ -260,29 +264,29 @@ aes_v8_encrypt: .align 5 aes_v8_decrypt: AARCH64_VALID_CALL_TARGET - ldr w3,[x2,#240] - ld1 {v0.4s},[x2],#16 - ld1 {v2.16b},[x0] + ldr w3,[PTR(2),#240] + ld1 {v0.4s},[PTR(2)],#16 + ld1 {v2.16b},[PTR(0)] sub w3,w3,#2 - ld1 {v1.4s},[x2],#16 + ld1 {v1.4s},[PTR(2)],#16 .Loop_dec: aesd v2.16b,v0.16b aesimc v2.16b,v2.16b - ld1 {v0.4s},[x2],#16 + ld1 {v0.4s},[PTR(2)],#16 subs w3,w3,#2 aesd v2.16b,v1.16b aesimc v2.16b,v2.16b - ld1 {v1.4s},[x2],#16 + ld1 {v1.4s},[PTR(2)],#16 b.gt .Loop_dec aesd v2.16b,v0.16b aesimc v2.16b,v2.16b - ld1 {v0.4s},[x2] + ld1 {v0.4s},[PTR(2)] aesd v2.16b,v1.16b eor v2.16b,v2.16b,v0.16b - st1 {v2.16b},[x1] + st1 {v2.16b},[PTR(1)] ret .size aes_v8_decrypt,.-aes_v8_decrypt .globl aes_v8_ecb_encrypt @@ -293,15 +297,15 @@ aes_v8_ecb_encrypt: subs x2,x2,#16 // Original input data size bigger than 16, jump to big size processing. b.ne .Lecb_big_size - ld1 {v0.16b},[x0] + ld1 {v0.16b},[PTR(0)] cmp w4,#0 // en- or decrypting? - ldr w5,[x3,#240] - ld1 {v5.4s,v6.4s},[x3],#32 // load key schedule... + ldr w5,[PTR(3),#240] + ld1 {v5.4s,v6.4s},[PTR(3)],#32 // load key schedule... b.eq .Lecb_small_dec aese v0.16b,v5.16b aesmc v0.16b,v0.16b - ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... + ld1 {v16.4s,v17.4s},[PTR(3)],#32 // load key schedule... aese v0.16b,v6.16b aesmc v0.16b,v0.16b subs w5,w5,#10 // if rounds==10, jump to aes-128-ecb processing @@ -309,39 +313,39 @@ aes_v8_ecb_encrypt: .Lecb_round_loop: aese v0.16b,v16.16b aesmc v0.16b,v0.16b - ld1 {v16.4s},[x3],#16 // load key schedule... + ld1 {v16.4s},[PTR(3)],#16 // load key schedule... aese v0.16b,v17.16b aesmc v0.16b,v0.16b - ld1 {v17.4s},[x3],#16 // load key schedule... + ld1 {v17.4s},[PTR(3)],#16 // load key schedule... 
subs w5,w5,#2 // bias b.gt .Lecb_round_loop .Lecb_128_enc: - ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... + ld1 {v18.4s,v19.4s},[PTR(3)],#32 // load key schedule... aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v0.16b,v17.16b aesmc v0.16b,v0.16b - ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... + ld1 {v20.4s,v21.4s},[PTR(3)],#32 // load key schedule... aese v0.16b,v18.16b aesmc v0.16b,v0.16b aese v0.16b,v19.16b aesmc v0.16b,v0.16b - ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... + ld1 {v22.4s,v23.4s},[PTR(3)],#32 // load key schedule... aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v0.16b,v21.16b aesmc v0.16b,v0.16b - ld1 {v7.4s},[x3] + ld1 {v7.4s},[PTR(3)] aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v0.16b,v23.16b eor v0.16b,v0.16b,v7.16b - st1 {v0.16b},[x1] + st1 {v0.16b},[PTR(1)] b .Lecb_Final_abort .Lecb_small_dec: aesd v0.16b,v5.16b aesimc v0.16b,v0.16b - ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... + ld1 {v16.4s,v17.4s},[PTR(3)],#32 // load key schedule... aesd v0.16b,v6.16b aesimc v0.16b,v0.16b subs w5,w5,#10 // bias @@ -349,61 +353,61 @@ aes_v8_ecb_encrypt: .Lecb_dec_round_loop: aesd v0.16b,v16.16b aesimc v0.16b,v0.16b - ld1 {v16.4s},[x3],#16 // load key schedule... + ld1 {v16.4s},[PTR(3)],#16 // load key schedule... aesd v0.16b,v17.16b aesimc v0.16b,v0.16b - ld1 {v17.4s},[x3],#16 // load key schedule... + ld1 {v17.4s},[PTR(3)],#16 // load key schedule... subs w5,w5,#2 // bias b.gt .Lecb_dec_round_loop .Lecb_128_dec: - ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... + ld1 {v18.4s,v19.4s},[PTR(3)],#32 // load key schedule... aesd v0.16b,v16.16b aesimc v0.16b,v0.16b aesd v0.16b,v17.16b aesimc v0.16b,v0.16b - ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... + ld1 {v20.4s,v21.4s},[PTR(3)],#32 // load key schedule... aesd v0.16b,v18.16b aesimc v0.16b,v0.16b aesd v0.16b,v19.16b aesimc v0.16b,v0.16b - ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... + ld1 {v22.4s,v23.4s},[PTR(3)],#32 // load key schedule... aesd v0.16b,v20.16b aesimc v0.16b,v0.16b aesd v0.16b,v21.16b aesimc v0.16b,v0.16b - ld1 {v7.4s},[x3] + ld1 {v7.4s},[PTR(3)] aesd v0.16b,v22.16b aesimc v0.16b,v0.16b aesd v0.16b,v23.16b eor v0.16b,v0.16b,v7.16b - st1 {v0.16b},[x1] + st1 {v0.16b},[PTR(1)] b .Lecb_Final_abort .Lecb_big_size: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 mov x8,#16 b.lo .Lecb_done csel x8,xzr,x8,eq cmp w4,#0 // en- or decrypting? - ldr w5,[x3,#240] + ldr w5,[PTR(3),#240] and x2,x2,#-16 - ld1 {v0.16b},[x0],x8 + ld1 {v0.16b},[PTR(0)],x8 - ld1 {v16.4s,v17.4s},[x3] // load key schedule... + ld1 {v16.4s,v17.4s},[PTR(3)] // load key schedule... 
sub w5,w5,#6 - add x7,x3,x5,lsl#4 // pointer to last 7 round keys + add PTR(7),PTR(3),x5,lsl#4 // pointer to last 7 round keys sub w5,w5,#2 - ld1 {v18.4s,v19.4s},[x7],#32 - ld1 {v20.4s,v21.4s},[x7],#32 - ld1 {v22.4s,v23.4s},[x7],#32 - ld1 {v7.4s},[x7] + ld1 {v18.4s,v19.4s},[PTR(7)],#32 + ld1 {v20.4s,v21.4s},[PTR(7)],#32 + ld1 {v22.4s,v23.4s},[PTR(7)],#32 + ld1 {v7.4s},[PTR(7)] - add x7,x3,#32 + add PTR(7),PTR(3),#32 mov w6,w5 b.eq .Lecb_dec - ld1 {v1.16b},[x0],#16 + ld1 {v1.16b},[PTR(0)],#16 subs x2,x2,#32 // bias add w6,w5,#2 orr v3.16b,v1.16b,v1.16b @@ -412,12 +416,12 @@ aes_v8_ecb_encrypt: b.lo .Lecb_enc_tail orr v1.16b,v3.16b,v3.16b - ld1 {v24.16b},[x0],#16 + ld1 {v24.16b},[PTR(0)],#16 cmp x2,#32 b.lo .Loop3x_ecb_enc - ld1 {v25.16b},[x0],#16 - ld1 {v26.16b},[x0],#16 + ld1 {v25.16b},[PTR(0)],#16 + ld1 {v26.16b},[PTR(0)],#16 sub x2,x2,#32 // bias mov w6,w5 @@ -432,7 +436,7 @@ aes_v8_ecb_encrypt: aesmc v25.16b,v25.16b aese v26.16b,v16.16b aesmc v26.16b,v26.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b @@ -444,7 +448,7 @@ aes_v8_ecb_encrypt: aesmc v25.16b,v25.16b aese v26.16b,v17.16b aesmc v26.16b,v26.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Loop5x_ecb_enc aese v0.16b,v16.16b @@ -471,7 +475,7 @@ aes_v8_ecb_encrypt: aese v26.16b,v17.16b aesmc v26.16b,v26.16b csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo - mov x7,x3 + mov PTR(7),PTR(3) aese v0.16b,v18.16b aesmc v0.16b,v0.16b @@ -483,7 +487,7 @@ aes_v8_ecb_encrypt: aesmc v25.16b,v25.16b aese v26.16b,v18.16b aesmc v26.16b,v26.16b - add x0,x0,x6 // x0 is adjusted in such way that + add PTR(0),PTR(0),x6 // PTR(0) is adjusted in such way that // at exit from the loop v1.16b-v26.16b // are loaded with last "words" add x6,x2,#0x60 // because .Lecb_enc_tail4x @@ -533,17 +537,17 @@ aes_v8_ecb_encrypt: aesmc v26.16b,v26.16b aese v0.16b,v23.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 aese v1.16b,v23.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 aese v24.16b,v23.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aese v25.16b,v23.16b - ld1 {v28.16b},[x0],#16 + ld1 {v28.16b},[PTR(0)],#16 aese v26.16b,v23.16b - ld1 {v29.16b},[x0],#16 + ld1 {v29.16b},[PTR(0)],#16 cbz x6,.Lecb_enc_tail4x - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] eor v4.16b,v7.16b,v0.16b orr v0.16b,v2.16b,v2.16b eor v5.16b,v7.16b,v1.16b @@ -553,14 +557,14 @@ aes_v8_ecb_encrypt: eor v30.16b,v7.16b,v25.16b orr v25.16b,v28.16b,v28.16b eor v31.16b,v7.16b,v26.16b - st1 {v4.16b},[x1],#16 + st1 {v4.16b},[PTR(1)],#16 orr v26.16b,v29.16b,v29.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 mov w6,w5 - st1 {v17.16b},[x1],#16 - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v30.16b},[x1],#16 - st1 {v31.16b},[x1],#16 + st1 {v17.16b},[PTR(1)],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[PTR(1)],#16 + st1 {v31.16b},[PTR(1)],#16 b.hs .Loop5x_ecb_enc add x2,x2,#0x50 @@ -581,10 +585,10 @@ aes_v8_ecb_encrypt: eor v17.16b,v7.16b,v24.16b eor v30.16b,v7.16b,v25.16b eor v31.16b,v7.16b,v26.16b - st1 {v5.16b},[x1],#16 - st1 {v17.16b},[x1],#16 - st1 {v30.16b},[x1],#16 - st1 {v31.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 + st1 {v17.16b},[PTR(1)],#16 + st1 {v30.16b},[PTR(1)],#16 + st1 {v31.16b},[PTR(1)],#16 b .Lecb_done .align 4 @@ -595,7 +599,7 @@ aes_v8_ecb_encrypt: aesmc v1.16b,v1.16b aese v24.16b,v16.16b aesmc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aese 
v0.16b,v17.16b aesmc v0.16b,v0.16b @@ -603,7 +607,7 @@ aes_v8_ecb_encrypt: aesmc v1.16b,v1.16b aese v24.16b,v17.16b aesmc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Loop3x_ecb_enc aese v0.16b,v16.16b @@ -620,45 +624,45 @@ aes_v8_ecb_encrypt: aesmc v1.16b,v1.16b aese v24.16b,v17.16b aesmc v24.16b,v24.16b - add x0,x0,x6 // x0 is adjusted in such way that + add PTR(0),PTR(0),x6 // PTR(0) is adjusted in such way that // at exit from the loop v1.16b-v24.16b // are loaded with last "words" - mov x7,x3 + mov PTR(7),PTR(3) aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v1.16b,v20.16b aesmc v1.16b,v1.16b aese v24.16b,v20.16b aesmc v24.16b,v24.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 aese v0.16b,v21.16b aesmc v0.16b,v0.16b aese v1.16b,v21.16b aesmc v1.16b,v1.16b aese v24.16b,v21.16b aesmc v24.16b,v24.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v1.16b,v22.16b aesmc v1.16b,v1.16b aese v24.16b,v22.16b aesmc v24.16b,v24.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aese v0.16b,v23.16b aese v1.16b,v23.16b aese v24.16b,v23.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] add w6,w5,#2 eor v4.16b,v7.16b,v0.16b eor v5.16b,v7.16b,v1.16b eor v24.16b,v24.16b,v7.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v4.16b},[x1],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[PTR(1)],#16 orr v0.16b,v2.16b,v2.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 orr v1.16b,v3.16b,v3.16b - st1 {v24.16b},[x1],#16 + st1 {v24.16b},[PTR(1)],#16 orr v24.16b,v27.16b,v27.16b b.hs .Loop3x_ecb_enc @@ -671,13 +675,13 @@ aes_v8_ecb_encrypt: aesmc v1.16b,v1.16b aese v24.16b,v16.16b aesmc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aese v1.16b,v17.16b aesmc v1.16b,v1.16b aese v24.16b,v17.16b aesmc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Lecb_enc_tail aese v1.16b,v16.16b @@ -706,17 +710,17 @@ aes_v8_ecb_encrypt: b.eq .Lecb_enc_one eor v5.16b,v7.16b,v1.16b eor v17.16b,v7.16b,v24.16b - st1 {v5.16b},[x1],#16 - st1 {v17.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 + st1 {v17.16b},[PTR(1)],#16 b .Lecb_done .Lecb_enc_one: eor v5.16b,v7.16b,v24.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 b .Lecb_done .align 5 .Lecb_dec: - ld1 {v1.16b},[x0],#16 + ld1 {v1.16b},[PTR(0)],#16 subs x2,x2,#32 // bias add w6,w5,#2 orr v3.16b,v1.16b,v1.16b @@ -725,12 +729,12 @@ aes_v8_ecb_encrypt: b.lo .Lecb_dec_tail orr v1.16b,v3.16b,v3.16b - ld1 {v24.16b},[x0],#16 + ld1 {v24.16b},[PTR(0)],#16 cmp x2,#32 b.lo .Loop3x_ecb_dec - ld1 {v25.16b},[x0],#16 - ld1 {v26.16b},[x0],#16 + ld1 {v25.16b},[PTR(0)],#16 + ld1 {v26.16b},[PTR(0)],#16 sub x2,x2,#32 // bias mov w6,w5 @@ -745,7 +749,7 @@ aes_v8_ecb_encrypt: aesimc v25.16b,v25.16b aesd v26.16b,v16.16b aesimc v26.16b,v26.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aesd v0.16b,v17.16b aesimc v0.16b,v0.16b @@ -757,7 +761,7 @@ aes_v8_ecb_encrypt: aesimc v25.16b,v25.16b aesd v26.16b,v17.16b aesimc v26.16b,v26.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Loop5x_ecb_dec aesd v0.16b,v16.16b @@ -784,7 +788,7 @@ aes_v8_ecb_encrypt: aesd v26.16b,v17.16b aesimc v26.16b,v26.16b csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo - mov x7,x3 + mov PTR(7),PTR(3) aesd v0.16b,v18.16b aesimc v0.16b,v0.16b @@ -796,7 +800,7 @@ aes_v8_ecb_encrypt: aesimc v25.16b,v25.16b aesd v26.16b,v18.16b aesimc v26.16b,v26.16b - 
add x0,x0,x6 // x0 is adjusted in such way that + add PTR(0),PTR(0),x6 // PTR(0) is adjusted in such way that // at exit from the loop v1.16b-v26.16b // are loaded with last "words" add x6,x2,#0x60 // because .Lecb_tail4x @@ -846,17 +850,17 @@ aes_v8_ecb_encrypt: aesimc v26.16b,v26.16b aesd v0.16b,v23.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 aesd v1.16b,v23.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 aesd v24.16b,v23.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aesd v25.16b,v23.16b - ld1 {v28.16b},[x0],#16 + ld1 {v28.16b},[PTR(0)],#16 aesd v26.16b,v23.16b - ld1 {v29.16b},[x0],#16 + ld1 {v29.16b},[PTR(0)],#16 cbz x6,.Lecb_tail4x - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] eor v4.16b,v7.16b,v0.16b orr v0.16b,v2.16b,v2.16b eor v5.16b,v7.16b,v1.16b @@ -866,14 +870,14 @@ aes_v8_ecb_encrypt: eor v30.16b,v7.16b,v25.16b orr v25.16b,v28.16b,v28.16b eor v31.16b,v7.16b,v26.16b - st1 {v4.16b},[x1],#16 + st1 {v4.16b},[PTR(1)],#16 orr v26.16b,v29.16b,v29.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 mov w6,w5 - st1 {v17.16b},[x1],#16 - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v30.16b},[x1],#16 - st1 {v31.16b},[x1],#16 + st1 {v17.16b},[PTR(1)],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[PTR(1)],#16 + st1 {v31.16b},[PTR(1)],#16 b.hs .Loop5x_ecb_dec add x2,x2,#0x50 @@ -894,10 +898,10 @@ aes_v8_ecb_encrypt: eor v17.16b,v7.16b,v24.16b eor v30.16b,v7.16b,v25.16b eor v31.16b,v7.16b,v26.16b - st1 {v5.16b},[x1],#16 - st1 {v17.16b},[x1],#16 - st1 {v30.16b},[x1],#16 - st1 {v31.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 + st1 {v17.16b},[PTR(1)],#16 + st1 {v30.16b},[PTR(1)],#16 + st1 {v31.16b},[PTR(1)],#16 b .Lecb_done .align 4 @@ -908,7 +912,7 @@ aes_v8_ecb_encrypt: aesimc v1.16b,v1.16b aesd v24.16b,v16.16b aesimc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aesd v0.16b,v17.16b aesimc v0.16b,v0.16b @@ -916,7 +920,7 @@ aes_v8_ecb_encrypt: aesimc v1.16b,v1.16b aesd v24.16b,v17.16b aesimc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Loop3x_ecb_dec aesd v0.16b,v16.16b @@ -933,45 +937,45 @@ aes_v8_ecb_encrypt: aesimc v1.16b,v1.16b aesd v24.16b,v17.16b aesimc v24.16b,v24.16b - add x0,x0,x6 // x0 is adjusted in such way that + add PTR(0),PTR(0),x6 // PTR(0) is adjusted in such way that // at exit from the loop v1.16b-v24.16b // are loaded with last "words" - mov x7,x3 + mov PTR(7),PTR(3) aesd v0.16b,v20.16b aesimc v0.16b,v0.16b aesd v1.16b,v20.16b aesimc v1.16b,v1.16b aesd v24.16b,v20.16b aesimc v24.16b,v24.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 aesd v0.16b,v21.16b aesimc v0.16b,v0.16b aesd v1.16b,v21.16b aesimc v1.16b,v1.16b aesd v24.16b,v21.16b aesimc v24.16b,v24.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 aesd v0.16b,v22.16b aesimc v0.16b,v0.16b aesd v1.16b,v22.16b aesimc v1.16b,v1.16b aesd v24.16b,v22.16b aesimc v24.16b,v24.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aesd v0.16b,v23.16b aesd v1.16b,v23.16b aesd v24.16b,v23.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] add w6,w5,#2 eor v4.16b,v7.16b,v0.16b eor v5.16b,v7.16b,v1.16b eor v24.16b,v24.16b,v7.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v4.16b},[x1],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[PTR(1)],#16 orr v0.16b,v2.16b,v2.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 orr 
v1.16b,v3.16b,v3.16b - st1 {v24.16b},[x1],#16 + st1 {v24.16b},[PTR(1)],#16 orr v24.16b,v27.16b,v27.16b b.hs .Loop3x_ecb_dec @@ -984,13 +988,13 @@ aes_v8_ecb_encrypt: aesimc v1.16b,v1.16b aesd v24.16b,v16.16b aesimc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aesd v1.16b,v17.16b aesimc v1.16b,v1.16b aesd v24.16b,v17.16b aesimc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Lecb_dec_tail aesd v1.16b,v16.16b @@ -1019,16 +1023,16 @@ aes_v8_ecb_encrypt: b.eq .Lecb_dec_one eor v5.16b,v7.16b,v1.16b eor v17.16b,v7.16b,v24.16b - st1 {v5.16b},[x1],#16 - st1 {v17.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 + st1 {v17.16b},[PTR(1)],#16 b .Lecb_done .Lecb_dec_one: eor v5.16b,v7.16b,v24.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 .Lecb_done: - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) .Lecb_Final_abort: ret .size aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt @@ -1038,29 +1042,29 @@ aes_v8_ecb_encrypt: aes_v8_cbc_encrypt: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 subs x2,x2,#16 mov x8,#16 b.lo .Lcbc_abort csel x8,xzr,x8,eq cmp w5,#0 // en- or decrypting? - ldr w5,[x3,#240] + ldr w5,[PTR(3),#240] and x2,x2,#-16 - ld1 {v6.16b},[x4] - ld1 {v0.16b},[x0],x8 + ld1 {v6.16b},[PTR(4)] + ld1 {v0.16b},[PTR(0)],x8 - ld1 {v16.4s,v17.4s},[x3] // load key schedule... + ld1 {v16.4s,v17.4s},[PTR(3)] // load key schedule... sub w5,w5,#6 - add x7,x3,x5,lsl#4 // pointer to last 7 round keys + add PTR(7),PTR(3),x5,lsl#4 // pointer to last 7 round keys sub w5,w5,#2 - ld1 {v18.4s,v19.4s},[x7],#32 - ld1 {v20.4s,v21.4s},[x7],#32 - ld1 {v22.4s,v23.4s},[x7],#32 - ld1 {v7.4s},[x7] + ld1 {v18.4s,v19.4s},[PTR(7)],#32 + ld1 {v20.4s,v21.4s},[PTR(7)],#32 + ld1 {v22.4s,v23.4s},[PTR(7)],#32 + ld1 {v7.4s},[PTR(7)] - add x7,x3,#32 + add PTR(7),PTR(3),#32 mov w6,w5 b.eq .Lcbc_dec @@ -1069,39 +1073,39 @@ aes_v8_cbc_encrypt: eor v5.16b,v16.16b,v7.16b b.eq .Lcbc_enc128 - ld1 {v2.4s,v3.4s},[x7] - add x7,x3,#16 - add x6,x3,#16*4 - add x12,x3,#16*5 + ld1 {v2.4s,v3.4s},[PTR(7)] + add PTR(7),PTR(3),#16 + add PTR(6),PTR(3),#16*4 + add PTR(12),PTR(3),#16*5 aese v0.16b,v16.16b aesmc v0.16b,v0.16b - add x14,x3,#16*6 - add x3,x3,#16*7 + add PTR(14),PTR(3),#16*6 + add PTR(3),PTR(3),#16*7 b .Lenter_cbc_enc .align 4 .Loop_cbc_enc: aese v0.16b,v16.16b aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 + st1 {v6.16b},[PTR(1)],#16 .Lenter_cbc_enc: aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v0.16b,v2.16b aesmc v0.16b,v0.16b - ld1 {v16.4s},[x6] + ld1 {v16.4s},[PTR(6)] cmp w5,#4 aese v0.16b,v3.16b aesmc v0.16b,v0.16b - ld1 {v17.4s},[x12] + ld1 {v17.4s},[PTR(12)] b.eq .Lcbc_enc192 aese v0.16b,v16.16b aesmc v0.16b,v0.16b - ld1 {v16.4s},[x14] + ld1 {v16.4s},[PTR(14)] aese v0.16b,v17.16b aesmc v0.16b,v0.16b - ld1 {v17.4s},[x3] + ld1 {v17.4s},[PTR(3)] nop .Lcbc_enc192: @@ -1115,32 +1119,32 @@ aes_v8_cbc_encrypt: aesmc v0.16b,v0.16b aese v0.16b,v19.16b aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 + ld1 {v16.16b},[PTR(0)],x8 aese v0.16b,v20.16b aesmc v0.16b,v0.16b eor v16.16b,v16.16b,v5.16b aese v0.16b,v21.16b aesmc v0.16b,v0.16b - ld1 {v17.4s},[x7] // re-pre-load rndkey[1] + ld1 {v17.4s},[PTR(7)] // re-pre-load rndkey[1] aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v0.16b,v23.16b eor v6.16b,v0.16b,v7.16b b.hs .Loop_cbc_enc - st1 {v6.16b},[x1],#16 + st1 {v6.16b},[PTR(1)],#16 b .Lcbc_done .align 5 .Lcbc_enc128: - ld1 
{v2.4s,v3.4s},[x7] + ld1 {v2.4s,v3.4s},[PTR(7)] aese v0.16b,v16.16b aesmc v0.16b,v0.16b b .Lenter_cbc_enc128 .Loop_cbc_enc128: aese v0.16b,v16.16b aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 + st1 {v6.16b},[PTR(1)],#16 .Lenter_cbc_enc128: aese v0.16b,v17.16b aesmc v0.16b,v0.16b @@ -1154,7 +1158,7 @@ aes_v8_cbc_encrypt: aesmc v0.16b,v0.16b aese v0.16b,v19.16b aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 + ld1 {v16.16b},[PTR(0)],x8 aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v0.16b,v21.16b @@ -1166,11 +1170,11 @@ aes_v8_cbc_encrypt: eor v6.16b,v0.16b,v7.16b b.hs .Loop_cbc_enc128 - st1 {v6.16b},[x1],#16 + st1 {v6.16b},[PTR(1)],#16 b .Lcbc_done .align 5 .Lcbc_dec: - ld1 {v24.16b},[x0],#16 + ld1 {v24.16b},[PTR(0)],#16 subs x2,x2,#32 // bias add w6,w5,#2 orr v3.16b,v0.16b,v0.16b @@ -1179,15 +1183,15 @@ aes_v8_cbc_encrypt: b.lo .Lcbc_dec_tail orr v1.16b,v24.16b,v24.16b - ld1 {v24.16b},[x0],#16 + ld1 {v24.16b},[PTR(0)],#16 orr v2.16b,v0.16b,v0.16b orr v3.16b,v1.16b,v1.16b orr v27.16b,v24.16b,v24.16b cmp x2,#32 b.lo .Loop3x_cbc_dec - ld1 {v25.16b},[x0],#16 - ld1 {v26.16b},[x0],#16 + ld1 {v25.16b},[PTR(0)],#16 + ld1 {v26.16b},[PTR(0)],#16 sub x2,x2,#32 // bias mov w6,w5 orr v28.16b,v25.16b,v25.16b @@ -1204,7 +1208,7 @@ aes_v8_cbc_encrypt: aesimc v25.16b,v25.16b aesd v26.16b,v16.16b aesimc v26.16b,v26.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aesd v0.16b,v17.16b aesimc v0.16b,v0.16b @@ -1216,7 +1220,7 @@ aes_v8_cbc_encrypt: aesimc v25.16b,v25.16b aesd v26.16b,v17.16b aesimc v26.16b,v26.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Loop5x_cbc_dec aesd v0.16b,v16.16b @@ -1243,7 +1247,7 @@ aes_v8_cbc_encrypt: aesd v26.16b,v17.16b aesimc v26.16b,v26.16b csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo - mov x7,x3 + mov PTR(7),PTR(3) aesd v0.16b,v18.16b aesimc v0.16b,v0.16b @@ -1255,7 +1259,7 @@ aes_v8_cbc_encrypt: aesimc v25.16b,v25.16b aesd v26.16b,v18.16b aesimc v26.16b,v26.16b - add x0,x0,x6 // x0 is adjusted in such way that + add PTR(0),PTR(0),x6 // PTR(0) is adjusted in such way that // at exit from the loop v1.16b-v26.16b // are loaded with last "words" add x6,x2,#0x60 // because .Lcbc_tail4x @@ -1307,21 +1311,21 @@ aes_v8_cbc_encrypt: eor v4.16b,v6.16b,v7.16b aesd v0.16b,v23.16b eor v5.16b,v2.16b,v7.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 aesd v1.16b,v23.16b eor v17.16b,v3.16b,v7.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 aesd v24.16b,v23.16b eor v30.16b,v27.16b,v7.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aesd v25.16b,v23.16b eor v31.16b,v28.16b,v7.16b - ld1 {v28.16b},[x0],#16 + ld1 {v28.16b},[PTR(0)],#16 aesd v26.16b,v23.16b orr v6.16b,v29.16b,v29.16b - ld1 {v29.16b},[x0],#16 + ld1 {v29.16b},[PTR(0)],#16 cbz x6,.Lcbc_tail4x - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] eor v4.16b,v4.16b,v0.16b orr v0.16b,v2.16b,v2.16b eor v5.16b,v5.16b,v1.16b @@ -1331,14 +1335,14 @@ aes_v8_cbc_encrypt: eor v30.16b,v30.16b,v25.16b orr v25.16b,v28.16b,v28.16b eor v31.16b,v31.16b,v26.16b - st1 {v4.16b},[x1],#16 + st1 {v4.16b},[PTR(1)],#16 orr v26.16b,v29.16b,v29.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 mov w6,w5 - st1 {v17.16b},[x1],#16 - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v30.16b},[x1],#16 - st1 {v31.16b},[x1],#16 + st1 {v17.16b},[PTR(1)],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[PTR(1)],#16 + st1 {v31.16b},[PTR(1)],#16 b.hs .Loop5x_cbc_dec add x2,x2,#0x50 @@ -1362,10 +1366,10 @@ aes_v8_cbc_encrypt: 
eor v17.16b,v17.16b,v24.16b eor v30.16b,v30.16b,v25.16b eor v31.16b,v31.16b,v26.16b - st1 {v5.16b},[x1],#16 - st1 {v17.16b},[x1],#16 - st1 {v30.16b},[x1],#16 - st1 {v31.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 + st1 {v17.16b},[PTR(1)],#16 + st1 {v30.16b},[PTR(1)],#16 + st1 {v31.16b},[PTR(1)],#16 b .Lcbc_done .align 4 @@ -1376,7 +1380,7 @@ aes_v8_cbc_encrypt: aesimc v1.16b,v1.16b aesd v24.16b,v16.16b aesimc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aesd v0.16b,v17.16b aesimc v0.16b,v0.16b @@ -1384,7 +1388,7 @@ aes_v8_cbc_encrypt: aesimc v1.16b,v1.16b aesd v24.16b,v17.16b aesimc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Loop3x_cbc_dec aesd v0.16b,v16.16b @@ -1404,46 +1408,46 @@ aes_v8_cbc_encrypt: aesd v24.16b,v17.16b aesimc v24.16b,v24.16b eor v17.16b,v3.16b,v7.16b - add x0,x0,x6 // x0 is adjusted in such way that + add PTR(0),PTR(0),x6 // PTR(0) is adjusted in such way that // at exit from the loop v1.16b-v24.16b // are loaded with last "words" orr v6.16b,v27.16b,v27.16b - mov x7,x3 + mov PTR(7),PTR(3) aesd v0.16b,v20.16b aesimc v0.16b,v0.16b aesd v1.16b,v20.16b aesimc v1.16b,v1.16b aesd v24.16b,v20.16b aesimc v24.16b,v24.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 aesd v0.16b,v21.16b aesimc v0.16b,v0.16b aesd v1.16b,v21.16b aesimc v1.16b,v1.16b aesd v24.16b,v21.16b aesimc v24.16b,v24.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 aesd v0.16b,v22.16b aesimc v0.16b,v0.16b aesd v1.16b,v22.16b aesimc v1.16b,v1.16b aesd v24.16b,v22.16b aesimc v24.16b,v24.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aesd v0.16b,v23.16b aesd v1.16b,v23.16b aesd v24.16b,v23.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] add w6,w5,#2 eor v4.16b,v4.16b,v0.16b eor v5.16b,v5.16b,v1.16b eor v24.16b,v24.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v4.16b},[x1],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[PTR(1)],#16 orr v0.16b,v2.16b,v2.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 orr v1.16b,v3.16b,v3.16b - st1 {v24.16b},[x1],#16 + st1 {v24.16b},[PTR(1)],#16 orr v24.16b,v27.16b,v27.16b b.hs .Loop3x_cbc_dec @@ -1456,13 +1460,13 @@ aes_v8_cbc_encrypt: aesimc v1.16b,v1.16b aesd v24.16b,v16.16b aesimc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aesd v1.16b,v17.16b aesimc v1.16b,v1.16b aesd v24.16b,v17.16b aesimc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Lcbc_dec_tail aesd v1.16b,v16.16b @@ -1494,19 +1498,19 @@ aes_v8_cbc_encrypt: eor v5.16b,v5.16b,v1.16b eor v17.16b,v17.16b,v24.16b orr v6.16b,v27.16b,v27.16b - st1 {v5.16b},[x1],#16 - st1 {v17.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 + st1 {v17.16b},[PTR(1)],#16 b .Lcbc_done .Lcbc_dec_one: eor v5.16b,v5.16b,v24.16b orr v6.16b,v27.16b,v27.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 .Lcbc_done: - st1 {v6.16b},[x4] + st1 {v6.16b},[PTR(4)] .Lcbc_abort: - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt .globl aes_v8_ctr32_encrypt_blocks @@ -1515,26 +1519,26 @@ aes_v8_cbc_encrypt: aes_v8_ctr32_encrypt_blocks: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - ldr w5,[x3,#240] + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + ldr w5,[PTR(3),#240] - ldr w8, [x4, #12] + ldr w8, [PTR(4), #12] #ifdef __AARCH64EB__ - ld1 {v0.16b},[x4] + ld1 {v0.16b},[PTR(4)] #else - ld1 {v0.4s},[x4] + ld1 {v0.4s},[PTR(4)] #endif - ld1 {v16.4s,v17.4s},[x3] // load key schedule... + ld1 {v16.4s,v17.4s},[PTR(3)] // load key schedule... sub w5,w5,#4 mov x12,#16 cmp x2,#2 - add x7,x3,x5,lsl#4 // pointer to last 5 round keys + add PTR(7),PTR(3),x5,lsl#4 // pointer to last 5 round keys sub w5,w5,#2 - ld1 {v20.4s,v21.4s},[x7],#32 - ld1 {v22.4s,v23.4s},[x7],#32 - ld1 {v7.4s},[x7] - add x7,x3,#32 + ld1 {v20.4s,v21.4s},[PTR(7)],#32 + ld1 {v22.4s,v23.4s},[PTR(7)],#32 + ld1 {v7.4s},[PTR(7)] + add PTR(7),PTR(3),#32 mov w6,w5 csel x12,xzr,x12,lo #ifndef __AARCH64EB__ @@ -1578,7 +1582,7 @@ aes_v8_ctr32_encrypt_blocks: aesmc v24.16b,v24.16b aese v25.16b,v16.16b aesmc v25.16b,v25.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b @@ -1590,10 +1594,10 @@ aes_v8_ctr32_encrypt_blocks: aesmc v24.16b,v24.16b aese v25.16b,v17.16b aesmc v25.16b,v25.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Loop5x_ctr32 - mov x7,x3 + mov PTR(7),PTR(3) aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v1.16b,v16.16b @@ -1604,7 +1608,7 @@ aes_v8_ctr32_encrypt_blocks: aesmc v24.16b,v24.16b aese v25.16b,v16.16b aesmc v25.16b,v25.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] aese v0.16b,v17.16b aesmc v0.16b,v0.16b @@ -1616,7 +1620,7 @@ aes_v8_ctr32_encrypt_blocks: aesmc v24.16b,v24.16b aese v25.16b,v17.16b aesmc v25.16b,v25.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] aese v0.16b,v20.16b aesmc v0.16b,v0.16b @@ -1652,19 +1656,19 @@ aes_v8_ctr32_encrypt_blocks: aese v0.16b,v22.16b aesmc v0.16b,v0.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 aese v1.16b,v22.16b aesmc v1.16b,v1.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 aese v18.16b,v22.16b aesmc v18.16b,v18.16b - ld1 {v19.16b},[x0],#16 + ld1 {v19.16b},[PTR(0)],#16 aese v24.16b,v22.16b aesmc v24.16b,v24.16b - ld1 {v26.16b},[x0],#16 + ld1 {v26.16b},[PTR(0)],#16 aese v25.16b,v22.16b aesmc v25.16b,v25.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aese v0.16b,v23.16b eor v2.16b,v2.16b,v7.16b @@ -1688,15 +1692,15 @@ aes_v8_ctr32_encrypt_blocks: eor v27.16b,v27.16b,v25.16b orr v25.16b,v6.16b,v6.16b - st1 {v2.16b},[x1],#16 + st1 {v2.16b},[PTR(1)],#16 mov v0.s[3],w9 - st1 {v3.16b},[x1],#16 + st1 {v3.16b},[PTR(1)],#16 mov v1.s[3],w10 - st1 {v19.16b},[x1],#16 + st1 {v19.16b},[PTR(1)],#16 mov v18.s[3],w12 - st1 {v26.16b},[x1],#16 + st1 {v26.16b},[PTR(1)],#16 mov v24.s[3],w13 - st1 {v27.16b},[x1],#16 + st1 {v27.16b},[PTR(1)],#16 mov v25.s[3],w14 mov w6,w5 @@ -1726,7 +1730,7 @@ aes_v8_ctr32_encrypt_blocks: aesmc v1.16b,v1.16b aese v18.16b,v16.16b aesmc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b @@ -1734,25 +1738,25 @@ aes_v8_ctr32_encrypt_blocks: aesmc v1.16b,v1.16b aese v18.16b,v17.16b aesmc v18.16b,v18.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Loop3x_ctr32 aese v0.16b,v16.16b aesmc v4.16b,v0.16b aese v1.16b,v16.16b aesmc v5.16b,v1.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 orr v0.16b,v6.16b,v6.16b aese v18.16b,v16.16b aesmc v18.16b,v18.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 orr v1.16b,v6.16b,v6.16b aese v4.16b,v17.16b aesmc v4.16b,v4.16b aese v5.16b,v17.16b aesmc 
v5.16b,v5.16b - ld1 {v19.16b},[x0],#16 - mov x7,x3 + ld1 {v19.16b},[PTR(0)],#16 + mov PTR(7),PTR(3) aese v18.16b,v17.16b aesmc v17.16b,v18.16b orr v18.16b,v6.16b,v6.16b @@ -1792,14 +1796,14 @@ aes_v8_ctr32_encrypt_blocks: aese v17.16b,v23.16b eor v2.16b,v2.16b,v4.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - st1 {v2.16b},[x1],#16 + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[PTR(1)],#16 eor v3.16b,v3.16b,v5.16b mov w6,w5 - st1 {v3.16b},[x1],#16 + st1 {v3.16b},[PTR(1)],#16 eor v19.16b,v19.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v19.16b},[x1],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[PTR(1)],#16 b.hs .Loop3x_ctr32 adds x2,x2,#3 @@ -1813,13 +1817,13 @@ aes_v8_ctr32_encrypt_blocks: aesmc v0.16b,v0.16b aese v1.16b,v16.16b aesmc v1.16b,v1.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Lctr32_tail aese v0.16b,v16.16b @@ -1830,12 +1834,12 @@ aes_v8_ctr32_encrypt_blocks: aesmc v0.16b,v0.16b aese v1.16b,v17.16b aesmc v1.16b,v1.16b - ld1 {v2.16b},[x0],x12 + ld1 {v2.16b},[PTR(0)],x12 aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v1.16b,v20.16b aesmc v1.16b,v1.16b - ld1 {v3.16b},[x0] + ld1 {v3.16b},[PTR(0)] aese v0.16b,v21.16b aesmc v0.16b,v0.16b aese v1.16b,v21.16b @@ -1852,12 +1856,12 @@ aes_v8_ctr32_encrypt_blocks: cmp x2,#1 eor v2.16b,v2.16b,v0.16b eor v3.16b,v3.16b,v1.16b - st1 {v2.16b},[x1],#16 + st1 {v2.16b},[PTR(1)],#16 b.eq .Lctr32_done - st1 {v3.16b},[x1] + st1 {v3.16b},[PTR(1)] .Lctr32_done: - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks .globl aes_v8_xts_encrypt @@ -1869,37 +1873,37 @@ aes_v8_xts_encrypt: // Original input data size bigger than 16, jump to big size processing. b.ne .Lxts_enc_big_size // Encrypt the iv with key2, as the first XEX iv. - ldr w6,[x4,#240] - ld1 {v0.4s},[x4],#16 - ld1 {v6.16b},[x5] + ldr w6,[PTR(4),#240] + ld1 {v0.4s},[PTR(4)],#16 + ld1 {v6.16b},[PTR(5)] sub w6,w6,#2 - ld1 {v1.4s},[x4],#16 + ld1 {v1.4s},[PTR(4)],#16 .Loop_enc_iv_enc: aese v6.16b,v0.16b aesmc v6.16b,v6.16b - ld1 {v0.4s},[x4],#16 + ld1 {v0.4s},[PTR(4)],#16 subs w6,w6,#2 aese v6.16b,v1.16b aesmc v6.16b,v6.16b - ld1 {v1.4s},[x4],#16 + ld1 {v1.4s},[PTR(4)],#16 b.gt .Loop_enc_iv_enc aese v6.16b,v0.16b aesmc v6.16b,v6.16b - ld1 {v0.4s},[x4] + ld1 {v0.4s},[PTR(4)] aese v6.16b,v1.16b eor v6.16b,v6.16b,v0.16b - ld1 {v0.16b},[x0] + ld1 {v0.16b},[PTR(0)] eor v0.16b,v6.16b,v0.16b - ldr w6,[x3,#240] - ld1 {v28.4s,v29.4s},[x3],#32 // load key schedule... + ldr w6,[PTR(3),#240] + ld1 {v28.4s,v29.4s},[PTR(3)],#32 // load key schedule... aese v0.16b,v28.16b aesmc v0.16b,v0.16b - ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... + ld1 {v16.4s,v17.4s},[PTR(3)],#32 // load key schedule... aese v0.16b,v29.16b aesmc v0.16b,v0.16b subs w6,w6,#10 // if rounds==10, jump to aes-128-xts processing @@ -1907,43 +1911,43 @@ aes_v8_xts_encrypt: .Lxts_enc_round_loop: aese v0.16b,v16.16b aesmc v0.16b,v0.16b - ld1 {v16.4s},[x3],#16 // load key schedule... + ld1 {v16.4s},[PTR(3)],#16 // load key schedule... aese v0.16b,v17.16b aesmc v0.16b,v0.16b - ld1 {v17.4s},[x3],#16 // load key schedule... + ld1 {v17.4s},[PTR(3)],#16 // load key schedule... subs w6,w6,#2 // bias b.gt .Lxts_enc_round_loop .Lxts_128_enc: - ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... 
+ ld1 {v18.4s,v19.4s},[PTR(3)],#32 // load key schedule... aese v0.16b,v16.16b aesmc v0.16b,v0.16b aese v0.16b,v17.16b aesmc v0.16b,v0.16b - ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... + ld1 {v20.4s,v21.4s},[PTR(3)],#32 // load key schedule... aese v0.16b,v18.16b aesmc v0.16b,v0.16b aese v0.16b,v19.16b aesmc v0.16b,v0.16b - ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... + ld1 {v22.4s,v23.4s},[PTR(3)],#32 // load key schedule... aese v0.16b,v20.16b aesmc v0.16b,v0.16b aese v0.16b,v21.16b aesmc v0.16b,v0.16b - ld1 {v7.4s},[x3] + ld1 {v7.4s},[PTR(3)] aese v0.16b,v22.16b aesmc v0.16b,v0.16b aese v0.16b,v23.16b eor v0.16b,v0.16b,v7.16b eor v0.16b,v0.16b,v6.16b - st1 {v0.16b},[x1] + st1 {v0.16b},[PTR(1)] b .Lxts_enc_final_abort .align 4 .Lxts_enc_big_size: - stp x19,x20,[sp,#-64]! - stp x21,x22,[sp,#48] - stp d8,d9,[sp,#32] - stp d10,d11,[sp,#16] + stp PTR(19),PTR(20),[PTRN(sp),#-(4*PTR_WIDTH+32)]! + stp PTR(21),PTR(22),[PTRN(sp),#(2*PTR_WIDTH)] + stp d8,d9,[PTRN(sp),#(4*PTR_WIDTH)] + stp d10,d11,[PTRN(sp),#(4*PTR_WIDTH+16)] // tailcnt store the tail value of length%16. and x21,x2,#0xf @@ -1954,25 +1958,25 @@ aes_v8_xts_encrypt: csel x8,xzr,x8,eq // Firstly, encrypt the iv with key2, as the first iv of XEX. - ldr w6,[x4,#240] - ld1 {v0.4s},[x4],#16 - ld1 {v6.16b},[x5] + ldr w6,[PTR(4),#240] + ld1 {v0.4s},[PTR(4)],#16 + ld1 {v6.16b},[PTR(5)] sub w6,w6,#2 - ld1 {v1.4s},[x4],#16 + ld1 {v1.4s},[PTR(4)],#16 .Loop_iv_enc: aese v6.16b,v0.16b aesmc v6.16b,v6.16b - ld1 {v0.4s},[x4],#16 + ld1 {v0.4s},[PTR(4)],#16 subs w6,w6,#2 aese v6.16b,v1.16b aesmc v6.16b,v6.16b - ld1 {v1.4s},[x4],#16 + ld1 {v1.4s},[PTR(4)],#16 b.gt .Loop_iv_enc aese v6.16b,v0.16b aesmc v6.16b,v6.16b - ld1 {v0.4s},[x4] + ld1 {v0.4s},[PTR(4)] aese v6.16b,v1.16b eor v6.16b,v6.16b,v0.16b @@ -1989,24 +1993,24 @@ aes_v8_xts_encrypt: fmov d8,x9 fmov v8.d[1],x10 - ldr w5,[x3,#240] // next starting point - ld1 {v0.16b},[x0],x8 + ldr w5,[PTR(3),#240] // next starting point + ld1 {v0.16b},[PTR(0)],x8 - ld1 {v16.4s,v17.4s},[x3] // load key schedule... + ld1 {v16.4s,v17.4s},[PTR(3)] // load key schedule... 
sub w5,w5,#6 - add x7,x3,x5,lsl#4 // pointer to last 7 round keys + add PTR(7),PTR(3),x5,lsl#4 // pointer to last 7 round keys sub w5,w5,#2 - ld1 {v18.4s,v19.4s},[x7],#32 - ld1 {v20.4s,v21.4s},[x7],#32 - ld1 {v22.4s,v23.4s},[x7],#32 - ld1 {v7.4s},[x7] + ld1 {v18.4s,v19.4s},[PTR(7)],#32 + ld1 {v20.4s,v21.4s},[PTR(7)],#32 + ld1 {v22.4s,v23.4s},[PTR(7)],#32 + ld1 {v7.4s},[PTR(7)] - add x7,x3,#32 + add PTR(7),PTR(3),#32 mov w6,w5 // Encryption .Lxts_enc: - ld1 {v24.16b},[x0],#16 + ld1 {v24.16b},[PTR(0)],#16 subs x2,x2,#32 // bias add w6,w5,#2 orr v3.16b,v0.16b,v0.16b @@ -2028,7 +2032,7 @@ aes_v8_xts_encrypt: orr v1.16b,v24.16b,v24.16b - ld1 {v24.16b},[x0],#16 + ld1 {v24.16b},[PTR(0)],#16 orr v2.16b,v0.16b,v0.16b orr v3.16b,v1.16b,v1.16b eor v27.16b,v24.16b,v9.16b // the third block @@ -2044,7 +2048,7 @@ aes_v8_xts_encrypt: fmov d10,x9 fmov v10.d[1],x10 - ld1 {v25.16b},[x0],#16 + ld1 {v25.16b},[PTR(0)],#16 // The iv for fifth block extr x22,x10,x10,#32 extr x10,x10,x9,#63 @@ -2053,7 +2057,7 @@ aes_v8_xts_encrypt: fmov d11,x9 fmov v11.d[1],x10 - ld1 {v26.16b},[x0],#16 + ld1 {v26.16b},[PTR(0)],#16 eor v25.16b,v25.16b,v10.16b // the fourth block eor v26.16b,v26.16b,v11.16b sub x2,x2,#32 // bias @@ -2072,7 +2076,7 @@ aes_v8_xts_encrypt: aesmc v25.16b,v25.16b aese v26.16b,v16.16b aesmc v26.16b,v26.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b @@ -2084,7 +2088,7 @@ aes_v8_xts_encrypt: aesmc v25.16b,v25.16b aese v26.16b,v17.16b aesmc v26.16b,v26.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Loop5x_xts_enc aese v0.16b,v16.16b @@ -2110,7 +2114,7 @@ aes_v8_xts_encrypt: aese v26.16b,v17.16b aesmc v26.16b,v26.16b csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo - mov x7,x3 + mov PTR(7),PTR(3) aese v0.16b,v18.16b aesmc v0.16b,v0.16b @@ -2122,7 +2126,7 @@ aes_v8_xts_encrypt: aesmc v25.16b,v25.16b aese v26.16b,v18.16b aesmc v26.16b,v26.16b - add x0,x0,x6 // x0 is adjusted in such way that + add PTR(0),PTR(0),x6 // x0 is adjusted in such way that // at exit from the loop v1.16b-v26.16b // are loaded with last "words" add x6,x2,#0x60 // because .Lxts_enc_tail4x @@ -2181,7 +2185,7 @@ aes_v8_xts_encrypt: fmov d6,x9 fmov v6.d[1],x10 eor v5.16b,v7.16b,v8.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 aese v1.16b,v23.16b // The iv for second block extr x22,x10,x10,#32 @@ -2191,7 +2195,7 @@ aes_v8_xts_encrypt: fmov d8,x9 fmov v8.d[1],x10 eor v17.16b,v7.16b,v9.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 aese v24.16b,v23.16b // The iv for third block extr x22,x10,x10,#32 @@ -2201,7 +2205,7 @@ aes_v8_xts_encrypt: fmov d9,x9 fmov v9.d[1],x10 eor v30.16b,v7.16b,v10.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aese v25.16b,v23.16b // The iv for fourth block extr x22,x10,x10,#32 @@ -2211,7 +2215,7 @@ aes_v8_xts_encrypt: fmov d10,x9 fmov v10.d[1],x10 eor v31.16b,v7.16b,v11.16b - ld1 {v28.16b},[x0],#16 + ld1 {v28.16b},[PTR(0)],#16 aese v26.16b,v23.16b // The iv for fifth block @@ -2222,9 +2226,9 @@ aes_v8_xts_encrypt: fmov d11,x9 fmov v11.d[1],x10 - ld1 {v29.16b},[x0],#16 + ld1 {v29.16b},[PTR(0)],#16 cbz x6,.Lxts_enc_tail4x - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] eor v4.16b,v4.16b,v0.16b eor v0.16b,v2.16b,v6.16b eor v5.16b,v5.16b,v1.16b @@ -2234,14 +2238,14 @@ aes_v8_xts_encrypt: eor v30.16b,v30.16b,v25.16b eor v25.16b,v28.16b,v10.16b eor v31.16b,v31.16b,v26.16b - st1 {v4.16b},[x1],#16 + st1 {v4.16b},[PTR(1)],#16 eor v26.16b,v29.16b,v11.16b - st1 
{v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 mov w6,w5 - st1 {v17.16b},[x1],#16 - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v30.16b},[x1],#16 - st1 {v31.16b},[x1],#16 + st1 {v17.16b},[PTR(1)],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[PTR(1)],#16 + st1 {v31.16b},[PTR(1)],#16 b.hs .Loop5x_xts_enc @@ -2276,14 +2280,14 @@ aes_v8_xts_encrypt: .align 4 .Lxts_enc_tail4x: - add x0,x0,#16 + add PTR(0),PTR(0),#16 eor v5.16b,v1.16b,v5.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 eor v17.16b,v24.16b,v17.16b - st1 {v17.16b},[x1],#16 + st1 {v17.16b},[PTR(1)],#16 eor v30.16b,v25.16b,v30.16b eor v31.16b,v26.16b,v31.16b - st1 {v30.16b,v31.16b},[x1],#32 + st1 {v30.16b,v31.16b},[PTR(1)],#32 b .Lxts_enc_done .align 4 @@ -2294,7 +2298,7 @@ aes_v8_xts_encrypt: aesmc v1.16b,v1.16b aese v24.16b,v16.16b aesmc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aese v0.16b,v17.16b aesmc v0.16b,v0.16b @@ -2302,7 +2306,7 @@ aes_v8_xts_encrypt: aesmc v1.16b,v1.16b aese v24.16b,v17.16b aesmc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Lxts_outer_enc_tail aese v0.16b,v16.16b @@ -2334,8 +2338,8 @@ aes_v8_xts_encrypt: eor v17.16b,v9.16b,v7.16b add x6,x6,#0x20 - add x0,x0,x6 - mov x7,x3 + add PTR(0),PTR(0),x6 + mov PTR(7),PTR(3) aese v0.16b,v20.16b aesmc v0.16b,v0.16b @@ -2358,16 +2362,16 @@ aes_v8_xts_encrypt: aese v0.16b,v23.16b aese v1.16b,v23.16b aese v24.16b,v23.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 add w6,w5,#2 - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] eor v4.16b,v4.16b,v0.16b eor v5.16b,v5.16b,v1.16b eor v24.16b,v24.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v4.16b},[x1],#16 - st1 {v5.16b},[x1],#16 - st1 {v24.16b},[x1],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[PTR(1)],#16 + st1 {v5.16b},[PTR(1)],#16 + st1 {v24.16b},[PTR(1)],#16 cmn x2,#0x30 b.eq .Lxts_enc_done .Lxts_encxor_one: @@ -2386,13 +2390,13 @@ aes_v8_xts_encrypt: aesmc v1.16b,v1.16b aese v24.16b,v16.16b aesmc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aese v1.16b,v17.16b aesmc v1.16b,v1.16b aese v24.16b,v17.16b aesmc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Lxts_enc_tail_loop aese v1.16b,v16.16b @@ -2422,10 +2426,10 @@ aes_v8_xts_encrypt: aese v24.16b,v23.16b b.eq .Lxts_enc_one eor v5.16b,v5.16b,v1.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 eor v17.16b,v17.16b,v24.16b orr v6.16b,v8.16b,v8.16b - st1 {v17.16b},[x1],#16 + st1 {v17.16b},[PTR(1)],#16 fmov x9,d8 fmov x10,v8.d[1] mov w19,#0x87 @@ -2440,7 +2444,7 @@ aes_v8_xts_encrypt: .Lxts_enc_one: eor v5.16b,v5.16b,v24.16b orr v6.16b,v6.16b,v6.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 fmov x9,d6 fmov x10,v6.d[1] mov w19,#0x87 @@ -2457,48 +2461,48 @@ aes_v8_xts_encrypt: tst x21,#0xf b.eq .Lxts_abort - mov x20,x0 - mov x13,x1 - sub x1,x1,#16 + mov PTR(20),PTR(0) + mov PTR(13),PTR(1) + sub PTR(1),PTR(1),#16 .composite_enc_loop: subs x21,x21,#1 - ldrb w15,[x1,x21] - ldrb w14,[x20,x21] - strb w15,[x13,x21] - strb w14,[x1,x21] + ldrb w15,[PTR(1),x21] + ldrb w14,[PTR(20),x21] + strb w15,[PTR(13),x21] + strb w14,[PTR(1),x21] b.gt .composite_enc_loop .Lxts_enc_load_done: - ld1 {v26.16b},[x1] + ld1 {v26.16b},[PTR(1)] eor v26.16b,v26.16b,v6.16b // Encrypt the composite block to get the last second encrypted text block - ldr w6,[x3,#240] // load key schedule... 
- ld1 {v0.4s},[x3],#16 + ldr w6,[PTR(3),#240] // load key schedule... + ld1 {v0.4s},[PTR(3)],#16 sub w6,w6,#2 - ld1 {v1.4s},[x3],#16 // load key schedule... + ld1 {v1.4s},[PTR(3)],#16 // load key schedule... .Loop_final_enc: aese v26.16b,v0.16b aesmc v26.16b,v26.16b - ld1 {v0.4s},[x3],#16 + ld1 {v0.4s},[PTR(3)],#16 subs w6,w6,#2 aese v26.16b,v1.16b aesmc v26.16b,v26.16b - ld1 {v1.4s},[x3],#16 + ld1 {v1.4s},[PTR(3)],#16 b.gt .Loop_final_enc aese v26.16b,v0.16b aesmc v26.16b,v26.16b - ld1 {v0.4s},[x3] + ld1 {v0.4s},[PTR(3)] aese v26.16b,v1.16b eor v26.16b,v26.16b,v0.16b eor v26.16b,v26.16b,v6.16b - st1 {v26.16b},[x1] + st1 {v26.16b},[PTR(1)] .Lxts_abort: - ldp x21,x22,[sp,#48] - ldp d8,d9,[sp,#32] - ldp d10,d11,[sp,#16] - ldp x19,x20,[sp],#64 + ldp PTR(21),PTR(22),[PTRN(sp),#(2*PTR_WIDTH)] + ldp d8,d9,[PTRN(sp),#(4*PTR_WIDTH)] + ldp d10,d11,[PTRN(sp),#(4*PTR_WIDTH+16)] + ldp PTR(19),PTR(20),[PTRN(sp)],#(4*PTR_WIDTH+32) .Lxts_enc_final_abort: ret .size aes_v8_xts_encrypt,.-aes_v8_xts_encrypt @@ -2511,37 +2515,37 @@ aes_v8_xts_decrypt: // Original input data size bigger than 16, jump to big size processing. b.ne .Lxts_dec_big_size // Encrypt the iv with key2, as the first XEX iv. - ldr w6,[x4,#240] - ld1 {v0.4s},[x4],#16 - ld1 {v6.16b},[x5] + ldr w6,[PTR(4),#240] + ld1 {v0.4s},[PTR(4)],#16 + ld1 {v6.16b},[PTR(5)] sub w6,w6,#2 - ld1 {v1.4s},[x4],#16 + ld1 {v1.4s},[PTR(4)],#16 .Loop_dec_small_iv_enc: aese v6.16b,v0.16b aesmc v6.16b,v6.16b - ld1 {v0.4s},[x4],#16 + ld1 {v0.4s},[PTR(4)],#16 subs w6,w6,#2 aese v6.16b,v1.16b aesmc v6.16b,v6.16b - ld1 {v1.4s},[x4],#16 + ld1 {v1.4s},[PTR(4)],#16 b.gt .Loop_dec_small_iv_enc aese v6.16b,v0.16b aesmc v6.16b,v6.16b - ld1 {v0.4s},[x4] + ld1 {v0.4s},[PTR(4)] aese v6.16b,v1.16b eor v6.16b,v6.16b,v0.16b - ld1 {v0.16b},[x0] + ld1 {v0.16b},[PTR(0)] eor v0.16b,v6.16b,v0.16b - ldr w6,[x3,#240] - ld1 {v28.4s,v29.4s},[x3],#32 // load key schedule... + ldr w6,[PTR(3),#240] + ld1 {v28.4s,v29.4s},[PTR(3)],#32 // load key schedule... aesd v0.16b,v28.16b aesimc v0.16b,v0.16b - ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... + ld1 {v16.4s,v17.4s},[PTR(3)],#32 // load key schedule... aesd v0.16b,v29.16b aesimc v0.16b,v0.16b subs w6,w6,#10 // bias @@ -2549,41 +2553,41 @@ aes_v8_xts_decrypt: .Lxts_dec_round_loop: aesd v0.16b,v16.16b aesimc v0.16b,v0.16b - ld1 {v16.4s},[x3],#16 // load key schedule... + ld1 {v16.4s},[PTR(3)],#16 // load key schedule... aesd v0.16b,v17.16b aesimc v0.16b,v0.16b - ld1 {v17.4s},[x3],#16 // load key schedule... + ld1 {v17.4s},[PTR(3)],#16 // load key schedule... subs w6,w6,#2 // bias b.gt .Lxts_dec_round_loop .Lxts_128_dec: - ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... + ld1 {v18.4s,v19.4s},[PTR(3)],#32 // load key schedule... aesd v0.16b,v16.16b aesimc v0.16b,v0.16b aesd v0.16b,v17.16b aesimc v0.16b,v0.16b - ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... + ld1 {v20.4s,v21.4s},[PTR(3)],#32 // load key schedule... aesd v0.16b,v18.16b aesimc v0.16b,v0.16b aesd v0.16b,v19.16b aesimc v0.16b,v0.16b - ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... + ld1 {v22.4s,v23.4s},[PTR(3)],#32 // load key schedule... aesd v0.16b,v20.16b aesimc v0.16b,v0.16b aesd v0.16b,v21.16b aesimc v0.16b,v0.16b - ld1 {v7.4s},[x3] + ld1 {v7.4s},[PTR(3)] aesd v0.16b,v22.16b aesimc v0.16b,v0.16b aesd v0.16b,v23.16b eor v0.16b,v0.16b,v7.16b eor v0.16b,v6.16b,v0.16b - st1 {v0.16b},[x1] + st1 {v0.16b},[PTR(1)] b .Lxts_dec_final_abort .Lxts_dec_big_size: - stp x19,x20,[sp,#-64]! 
- stp x21,x22,[sp,#48] - stp d8,d9,[sp,#32] - stp d10,d11,[sp,#16] + stp PTR(19),PTR(20),[PTRN(sp),#-(4*PTR_WIDTH+32)]! + stp PTR(21),PTR(22),[PTRN(sp),#(2*PTR_WIDTH)] + stp d8,d9,[PTRN(sp),#(4*PTR_WIDTH)] + stp d10,d11,[PTRN(sp),#(4*PTR_WIDTH+16)] and x21,x2,#0xf and x2,x2,#-16 @@ -2592,25 +2596,25 @@ aes_v8_xts_decrypt: b.lo .Lxts_dec_abort // Encrypt the iv with key2, as the first XEX iv - ldr w6,[x4,#240] - ld1 {v0.4s},[x4],#16 - ld1 {v6.16b},[x5] + ldr w6,[PTR(4),#240] + ld1 {v0.4s},[PTR(4)],#16 + ld1 {v6.16b},[PTR(5)] sub w6,w6,#2 - ld1 {v1.4s},[x4],#16 + ld1 {v1.4s},[PTR(4)],#16 .Loop_dec_iv_enc: aese v6.16b,v0.16b aesmc v6.16b,v6.16b - ld1 {v0.4s},[x4],#16 + ld1 {v0.4s},[PTR(4)],#16 subs w6,w6,#2 aese v6.16b,v1.16b aesmc v6.16b,v6.16b - ld1 {v1.4s},[x4],#16 + ld1 {v1.4s},[PTR(4)],#16 b.gt .Loop_dec_iv_enc aese v6.16b,v0.16b aesmc v6.16b,v6.16b - ld1 {v0.4s},[x4] + ld1 {v0.4s},[PTR(4)] aese v6.16b,v1.16b eor v6.16b,v6.16b,v0.16b @@ -2627,7 +2631,7 @@ aes_v8_xts_decrypt: fmov d8,x9 fmov v8.d[1],x10 - ldr w5,[x3,#240] // load rounds number + ldr w5,[PTR(3),#240] // load rounds number // The iv for third block extr x22,x10,x10,#32 @@ -2637,14 +2641,14 @@ aes_v8_xts_decrypt: fmov d9,x9 fmov v9.d[1],x10 - ld1 {v16.4s,v17.4s},[x3] // load key schedule... + ld1 {v16.4s,v17.4s},[PTR(3)] // load key schedule... sub w5,w5,#6 - add x7,x3,x5,lsl#4 // pointer to last 7 round keys + add PTR(7),PTR(3),x5,lsl#4 // pointer to last 7 round keys sub w5,w5,#2 - ld1 {v18.4s,v19.4s},[x7],#32 // load key schedule... - ld1 {v20.4s,v21.4s},[x7],#32 - ld1 {v22.4s,v23.4s},[x7],#32 - ld1 {v7.4s},[x7] + ld1 {v18.4s,v19.4s},[PTR(7)],#32 // load key schedule... + ld1 {v20.4s,v21.4s},[PTR(7)],#32 + ld1 {v22.4s,v23.4s},[PTR(7)],#32 + ld1 {v7.4s},[PTR(7)] // The iv for fourth block extr x22,x10,x10,#32 @@ -2654,7 +2658,7 @@ aes_v8_xts_decrypt: fmov d10,x9 fmov v10.d[1],x10 - add x7,x3,#32 + add PTR(7),PTR(3),#32 mov w6,w5 b .Lxts_dec @@ -2665,17 +2669,17 @@ aes_v8_xts_decrypt: b.eq .Lxts_dec_begin subs x2,x2,#16 csel x8,xzr,x8,eq - ld1 {v0.16b},[x0],#16 + ld1 {v0.16b},[PTR(0)],#16 b.lo .Lxts_done - sub x0,x0,#16 + sub PTR(0),PTR(0),#16 .Lxts_dec_begin: - ld1 {v0.16b},[x0],x8 + ld1 {v0.16b},[PTR(0)],x8 subs x2,x2,#32 // bias add w6,w5,#2 orr v3.16b,v0.16b,v0.16b orr v1.16b,v0.16b,v0.16b orr v28.16b,v0.16b,v0.16b - ld1 {v24.16b},[x0],#16 + ld1 {v24.16b},[PTR(0)],#16 orr v27.16b,v24.16b,v24.16b orr v29.16b,v24.16b,v24.16b b.lo .Lxts_inner_dec_tail @@ -2683,7 +2687,7 @@ aes_v8_xts_decrypt: eor v24.16b,v24.16b,v8.16b orr v1.16b,v24.16b,v24.16b - ld1 {v24.16b},[x0],#16 + ld1 {v24.16b},[PTR(0)],#16 orr v2.16b,v0.16b,v0.16b orr v3.16b,v1.16b,v1.16b eor v27.16b,v24.16b,v9.16b // third block xox with third iv @@ -2691,7 +2695,7 @@ aes_v8_xts_decrypt: cmp x2,#32 b.lo .Lxts_outer_dec_tail - ld1 {v25.16b},[x0],#16 + ld1 {v25.16b},[PTR(0)],#16 // The iv for fifth block extr x22,x10,x10,#32 @@ -2701,7 +2705,7 @@ aes_v8_xts_decrypt: fmov d11,x9 fmov v11.d[1],x10 - ld1 {v26.16b},[x0],#16 + ld1 {v26.16b},[PTR(0)],#16 eor v25.16b,v25.16b,v10.16b // the fourth block eor v26.16b,v26.16b,v11.16b sub x2,x2,#32 // bias @@ -2720,7 +2724,7 @@ aes_v8_xts_decrypt: aesimc v25.16b,v25.16b aesd v26.16b,v16.16b aesimc v26.16b,v26.16b - ld1 {v16.4s},[x7],#16 // load key schedule... + ld1 {v16.4s},[PTR(7)],#16 // load key schedule... subs w6,w6,#2 aesd v0.16b,v17.16b aesimc v0.16b,v0.16b @@ -2732,7 +2736,7 @@ aes_v8_xts_decrypt: aesimc v25.16b,v25.16b aesd v26.16b,v17.16b aesimc v26.16b,v26.16b - ld1 {v17.4s},[x7],#16 // load key schedule... 
+ ld1 {v17.4s},[PTR(7)],#16 // load key schedule... b.gt .Loop5x_xts_dec aesd v0.16b,v16.16b @@ -2758,7 +2762,7 @@ aes_v8_xts_decrypt: aesd v26.16b,v17.16b aesimc v26.16b,v26.16b csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo - mov x7,x3 + mov PTR(7),PTR(3) aesd v0.16b,v18.16b aesimc v0.16b,v0.16b @@ -2770,7 +2774,7 @@ aes_v8_xts_decrypt: aesimc v25.16b,v25.16b aesd v26.16b,v18.16b aesimc v26.16b,v26.16b - add x0,x0,x6 // x0 is adjusted in such way that + add PTR(0),PTR(0),x6 // x0 is adjusted in such way that // at exit from the loop v1.16b-v26.16b // are loaded with last "words" add x6,x2,#0x60 // because .Lxts_dec_tail4x @@ -2829,7 +2833,7 @@ aes_v8_xts_decrypt: fmov d6,x9 fmov v6.d[1],x10 eor v5.16b,v7.16b,v8.16b - ld1 {v2.16b},[x0],#16 + ld1 {v2.16b},[PTR(0)],#16 aesd v1.16b,v23.16b // The iv for second block extr x22,x10,x10,#32 @@ -2839,7 +2843,7 @@ aes_v8_xts_decrypt: fmov d8,x9 fmov v8.d[1],x10 eor v17.16b,v7.16b,v9.16b - ld1 {v3.16b},[x0],#16 + ld1 {v3.16b},[PTR(0)],#16 aesd v24.16b,v23.16b // The iv for third block extr x22,x10,x10,#32 @@ -2849,7 +2853,7 @@ aes_v8_xts_decrypt: fmov d9,x9 fmov v9.d[1],x10 eor v30.16b,v7.16b,v10.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aesd v25.16b,v23.16b // The iv for fourth block extr x22,x10,x10,#32 @@ -2859,7 +2863,7 @@ aes_v8_xts_decrypt: fmov d10,x9 fmov v10.d[1],x10 eor v31.16b,v7.16b,v11.16b - ld1 {v28.16b},[x0],#16 + ld1 {v28.16b},[PTR(0)],#16 aesd v26.16b,v23.16b // The iv for fifth block @@ -2870,9 +2874,9 @@ aes_v8_xts_decrypt: fmov d11,x9 fmov v11.d[1],x10 - ld1 {v29.16b},[x0],#16 + ld1 {v29.16b},[PTR(0)],#16 cbz x6,.Lxts_dec_tail4x - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] eor v4.16b,v4.16b,v0.16b eor v0.16b,v2.16b,v6.16b eor v5.16b,v5.16b,v1.16b @@ -2882,14 +2886,14 @@ aes_v8_xts_decrypt: eor v30.16b,v30.16b,v25.16b eor v25.16b,v28.16b,v10.16b eor v31.16b,v31.16b,v26.16b - st1 {v4.16b},[x1],#16 + st1 {v4.16b},[PTR(1)],#16 eor v26.16b,v29.16b,v11.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 mov w6,w5 - st1 {v17.16b},[x1],#16 - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v30.16b},[x1],#16 - st1 {v31.16b},[x1],#16 + st1 {v17.16b},[PTR(1)],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v30.16b},[PTR(1)],#16 + st1 {v31.16b},[PTR(1)],#16 b.hs .Loop5x_xts_dec cmn x2,#0x10 @@ -2925,18 +2929,18 @@ aes_v8_xts_decrypt: .align 4 .Lxts_dec_tail4x: - add x0,x0,#16 + add PTR(0),PTR(0),#16 tst x21,#0xf eor v5.16b,v1.16b,v4.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 eor v17.16b,v24.16b,v17.16b - st1 {v17.16b},[x1],#16 + st1 {v17.16b},[PTR(1)],#16 eor v30.16b,v25.16b,v30.16b eor v31.16b,v26.16b,v31.16b - st1 {v30.16b,v31.16b},[x1],#32 + st1 {v30.16b,v31.16b},[PTR(1)],#32 b.eq .Lxts_dec_abort - ld1 {v0.16b},[x0],#16 + ld1 {v0.16b},[PTR(0)],#16 b .Lxts_done .align 4 .Lxts_outer_dec_tail: @@ -2946,7 +2950,7 @@ aes_v8_xts_decrypt: aesimc v1.16b,v1.16b aesd v24.16b,v16.16b aesimc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aesd v0.16b,v17.16b aesimc v0.16b,v0.16b @@ -2954,7 +2958,7 @@ aes_v8_xts_decrypt: aesimc v1.16b,v1.16b aesd v24.16b,v17.16b aesimc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Lxts_outer_dec_tail aesd v0.16b,v16.16b @@ -2993,9 +2997,9 @@ aes_v8_xts_decrypt: fmov v8.d[1],x10 add x6,x6,#0x20 - add x0,x0,x6 // x0 is adjusted to the last data + add PTR(0),PTR(0),x6 // PTR(0) is adjusted to the last data - mov x7,x3 + mov PTR(7),PTR(3) // The 
iv for third block extr x22,x10,x10,#32 @@ -3023,19 +3027,19 @@ aes_v8_xts_decrypt: aesimc v1.16b,v1.16b aesd v24.16b,v22.16b aesimc v24.16b,v24.16b - ld1 {v27.16b},[x0],#16 + ld1 {v27.16b},[PTR(0)],#16 aesd v0.16b,v23.16b aesd v1.16b,v23.16b aesd v24.16b,v23.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + ld1 {v16.4s},[PTR(7)],#16 // re-pre-load rndkey[0] add w6,w5,#2 eor v4.16b,v4.16b,v0.16b eor v5.16b,v5.16b,v1.16b eor v24.16b,v24.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v4.16b},[x1],#16 - st1 {v5.16b},[x1],#16 - st1 {v24.16b},[x1],#16 + ld1 {v17.4s},[PTR(7)],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[PTR(1)],#16 + st1 {v5.16b},[PTR(1)],#16 + st1 {v24.16b},[PTR(1)],#16 cmn x2,#0x30 add x2,x2,#0x30 @@ -3057,13 +3061,13 @@ aes_v8_xts_decrypt: aesimc v1.16b,v1.16b aesd v24.16b,v16.16b aesimc v24.16b,v24.16b - ld1 {v16.4s},[x7],#16 + ld1 {v16.4s},[PTR(7)],#16 subs w6,w6,#2 aesd v1.16b,v17.16b aesimc v1.16b,v1.16b aesd v24.16b,v17.16b aesimc v24.16b,v24.16b - ld1 {v17.4s},[x7],#16 + ld1 {v17.4s},[PTR(7)],#16 b.gt .Lxts_dec_tail_loop aesd v1.16b,v16.16b @@ -3096,8 +3100,8 @@ aes_v8_xts_decrypt: eor v17.16b,v17.16b,v24.16b orr v6.16b,v9.16b,v9.16b orr v8.16b,v10.16b,v10.16b - st1 {v5.16b},[x1],#16 - st1 {v17.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 + st1 {v17.16b},[PTR(1)],#16 add x2,x2,#16 b .Lxts_done @@ -3105,86 +3109,86 @@ aes_v8_xts_decrypt: eor v5.16b,v5.16b,v24.16b orr v6.16b,v8.16b,v8.16b orr v8.16b,v9.16b,v9.16b - st1 {v5.16b},[x1],#16 + st1 {v5.16b},[PTR(1)],#16 add x2,x2,#32 .Lxts_done: tst x21,#0xf b.eq .Lxts_dec_abort // Processing the last two blocks with cipher stealing. - mov x7,x3 + mov PTR(7),PTR(3) cbnz x2,.Lxts_dec_1st_done - ld1 {v0.16b},[x0],#16 + ld1 {v0.16b},[PTR(0)],#16 // Decrypt the last secod block to get the last plain text block .Lxts_dec_1st_done: eor v26.16b,v0.16b,v8.16b - ldr w6,[x3,#240] - ld1 {v0.4s},[x3],#16 + ldr w6,[PTR(3),#240] + ld1 {v0.4s},[PTR(3)],#16 sub w6,w6,#2 - ld1 {v1.4s},[x3],#16 + ld1 {v1.4s},[PTR(3)],#16 .Loop_final_2nd_dec: aesd v26.16b,v0.16b aesimc v26.16b,v26.16b - ld1 {v0.4s},[x3],#16 // load key schedule... + ld1 {v0.4s},[PTR(3)],#16 // load key schedule... subs w6,w6,#2 aesd v26.16b,v1.16b aesimc v26.16b,v26.16b - ld1 {v1.4s},[x3],#16 // load key schedule... + ld1 {v1.4s},[PTR(3)],#16 // load key schedule... b.gt .Loop_final_2nd_dec aesd v26.16b,v0.16b aesimc v26.16b,v26.16b - ld1 {v0.4s},[x3] + ld1 {v0.4s},[PTR(3)] aesd v26.16b,v1.16b eor v26.16b,v26.16b,v0.16b eor v26.16b,v26.16b,v8.16b - st1 {v26.16b},[x1] + st1 {v26.16b},[PTR(1)] - mov x20,x0 - add x13,x1,#16 + mov PTR(20),PTR(0) + add PTR(13),PTR(1),#16 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks // to get the last encrypted block. .composite_dec_loop: subs x21,x21,#1 - ldrb w15,[x1,x21] - ldrb w14,[x20,x21] - strb w15,[x13,x21] - strb w14,[x1,x21] + ldrb w15,[PTR(1),x21] + ldrb w14,[PTR(20),x21] + strb w15,[PTR(13),x21] + strb w14,[PTR(1),x21] b.gt .composite_dec_loop .Lxts_dec_load_done: - ld1 {v26.16b},[x1] + ld1 {v26.16b},[PTR(1)] eor v26.16b,v26.16b,v6.16b // Decrypt the composite block to get the last second plain text block - ldr w6,[x7,#240] - ld1 {v0.4s},[x7],#16 + ldr w6,[PTR(7),#240] + ld1 {v0.4s},[PTR(7)],#16 sub w6,w6,#2 - ld1 {v1.4s},[x7],#16 + ld1 {v1.4s},[PTR(7)],#16 .Loop_final_dec: aesd v26.16b,v0.16b aesimc v26.16b,v26.16b - ld1 {v0.4s},[x7],#16 // load key schedule... + ld1 {v0.4s},[PTR(7)],#16 // load key schedule... 
subs w6,w6,#2 aesd v26.16b,v1.16b aesimc v26.16b,v26.16b - ld1 {v1.4s},[x7],#16 // load key schedule... + ld1 {v1.4s},[PTR(7)],#16 // load key schedule... b.gt .Loop_final_dec aesd v26.16b,v0.16b aesimc v26.16b,v26.16b - ld1 {v0.4s},[x7] + ld1 {v0.4s},[PTR(7)] aesd v26.16b,v1.16b eor v26.16b,v26.16b,v0.16b eor v26.16b,v26.16b,v6.16b - st1 {v26.16b},[x1] + st1 {v26.16b},[PTR(1)] .Lxts_dec_abort: - ldp x21,x22,[sp,#48] - ldp d8,d9,[sp,#32] - ldp d10,d11,[sp,#16] - ldp x19,x20,[sp],#64 + ldp PTR(21),PTR(22),[PTRN(sp),#(2*PTR_WIDTH)] + ldp d8,d9,[PTRN(sp),#(4*PTR_WIDTH)] + ldp d10,d11,[PTRN(sp),#(4*PTR_WIDTH+16)] + ldp PTR(19),PTR(20),[PTRN(sp)],#(4*PTR_WIDTH+32) .Lxts_dec_final_abort: ret diff --git a/sys/crypto/openssl/aarch64/arm64cpuid.S b/sys/crypto/openssl/aarch64/arm64cpuid.S index 52c6ee5b65d3..01adabd302d2 100644 --- a/sys/crypto/openssl/aarch64/arm64cpuid.S +++ b/sys/crypto/openssl/aarch64/arm64cpuid.S @@ -2,7 +2,9 @@ #include "arm_arch.h" .text +#ifndef __CHERI_PURE_CAPABILITY__ .arch armv8-a+crypto +#endif .align 5 .globl _armv7_neon_probe @@ -83,7 +85,7 @@ OPENSSL_cleanse: b.hi .Lot // len>15 nop .Little: - strb wzr,[x0],#1 // store byte-by-byte + strb wzr,[PTR(0)],#1 // store byte-by-byte subs x1,x1,#1 b.ne .Little .Lret: ret @@ -91,13 +93,13 @@ OPENSSL_cleanse: .align 4 .Lot: tst x0,#7 b.eq .Laligned // inp is aligned - strb wzr,[x0],#1 // store byte-by-byte + strb wzr,[PTR(0)],#1 // store byte-by-byte sub x1,x1,#1 b .Lot .align 4 .Laligned: - str xzr,[x0],#8 // store word-by-word + str xzr,[PTR(0)],#8 // store word-by-word sub x1,x1,#8 tst x1,#-8 b.ne .Laligned // len>=8 @@ -114,8 +116,8 @@ CRYPTO_memcmp: cbz x2,.Lno_data // len==0? cmp x2,#16 b.ne .Loop_cmp - ldp x8,x9,[x0] - ldp x10,x11,[x1] + ldp x8,x9,[PTR(0)] + ldp x10,x11,[PTR(1)] eor x8,x8,x10 eor x9,x9,x11 orr x8,x8,x9 @@ -126,8 +128,8 @@ CRYPTO_memcmp: .align 4 .Loop_cmp: - ldrb w4,[x0],#1 - ldrb w5,[x1],#1 + ldrb w4,[PTR(0)],#1 + ldrb w5,[PTR(1)],#1 eor w4,w4,w5 orr w3,w3,w4 subs x2,x2,#1 diff --git a/sys/crypto/openssl/aarch64/armv8-mont.S b/sys/crypto/openssl/aarch64/armv8-mont.S index 8b85fb080aba..1d51bcb12031 100644 --- a/sys/crypto/openssl/aarch64/armv8-mont.S +++ b/sys/crypto/openssl/aarch64/armv8-mont.S @@ -17,8 +17,8 @@ bn_mul_mont: cmp x5,#32 b.le .Lscalar_impl #ifndef __KERNEL__ - adrp x17,OPENSSL_armv8_rsa_neonized - ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] + adrp PTR(17),OPENSSL_armv8_rsa_neonized + ldr w17,[PTR(17),#:lo12:OPENSSL_armv8_rsa_neonized] cbnz w17, bn_mul8x_mont_neon #endif @@ -29,19 +29,28 @@ bn_mul_mont: b.eq __bn_mul4x_mont .Lmul_mont: - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldr x9,[x2],#8 // bp[0] - sub x22,sp,x5,lsl#3 - ldp x7,x8,[x1],#16 // ap[0..1] + stp PTR(29),PTR(30),[PTRN(sp),#-(8*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + + ldr x9,[PTR(2)],#8 // bp[0] +#ifdef __CHERI_PURE_CAPABILITY__ + neg x6,x5 + add PTR(22),csp,x6,lsl#3 +#else + sub PTR(22),sp,x5,lsl#3 +#endif + ldp x7,x8,[PTR(1)],#16 // ap[0..1] lsl x5,x5,#3 - ldr x4,[x4] // *n0 - and x22,x22,#-16 // ABI says so - ldp x13,x14,[x3],#16 // np[0..1] + ldr x4,[PTR(4)] // *n0 +#ifdef __CHERI_PURE_CAPABILITY__ + alignd PTR(22),PTR(22),#4 // ABI says so +#else + and PTR(22),PTR(22),#-16 // ABI says so +#endif + ldp x13,x14,[PTR(3)],#16 // np[0..1] mul x6,x7,x9 // ap[0]*bp[0] sub x21,x5,#16 // j=num-2 @@ -50,7 +59,7 @@ bn_mul_mont: umulh x11,x8,x9 mul x15,x6,x4 // "tp[0]"*n0 - mov sp,x22 // alloca + mov PTRN(sp),PTR(22) // alloca // (*) mul x12,x13,x15 // np[0]*m1 umulh x13,x13,x15 @@ -71,12 +80,12 @@ bn_mul_mont: cbz x21,.L1st_skip .L1st: - ldr x8,[x1],#8 + ldr x8,[PTR(1)],#8 adds x6,x10,x7 sub x21,x21,#8 // j-- adc x7,x11,xzr - ldr x14,[x3],#8 + ldr x14,[PTR(3)],#8 adds x12,x16,x13 mul x10,x8,x9 // ap[j]*bp[0] adc x13,x17,xzr @@ -86,35 +95,49 @@ bn_mul_mont: mul x16,x14,x15 // np[j]*m1 adc x13,x13,xzr umulh x17,x14,x15 - str x12,[x22],#8 // tp[j-1] + str x12,[PTR(22)],#8 // tp[j-1] cbnz x21,.L1st .L1st_skip: +#ifdef __CHERI_PURE_CAPABILITY__ + neg x5,x5 +#endif adds x6,x10,x7 - sub x1,x1,x5 // rewind x1 +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(1),PTR(1),x5 // rewind PTR(1) +#else + sub PTR(1),PTR(1),x5 // rewind PTR(1) +#endif adc x7,x11,xzr adds x12,x16,x13 - sub x3,x3,x5 // rewind x3 +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(3),PTR(3),x5 // rewind PTR(3) +#else + sub PTR(3),PTR(3),x5 // rewind PTR(3) +#endif adc x13,x17,xzr +#ifdef __CHERI_PURE_CAPABILITY__ + neg x5,x5 +#endif adds x12,x12,x6 sub x20,x5,#8 // i=num-1 adcs x13,x13,x7 adc x19,xzr,xzr // upmost overflow bit - stp x12,x13,[x22] + stp x12,x13,[PTR(22)] .Louter: - ldr x9,[x2],#8 // bp[i] - ldp x7,x8,[x1],#16 - ldr x23,[sp] // tp[0] - add x22,sp,#8 + ldr x9,[PTR(2)],#8 // bp[i] + ldp x7,x8,[PTR(1)],#16 + ldr x23,[PTRN(sp)] // tp[0] + add PTR(22),PTRN(sp),#8 mul x6,x7,x9 // ap[0]*bp[i] sub x21,x5,#16 // j=num-2 umulh x7,x7,x9 - ldp x13,x14,[x3],#16 + ldp x13,x14,[PTR(3)],#16 mul x10,x8,x9 // ap[1]*bp[i] adds x6,x6,x23 umulh x11,x8,x9 @@ -132,15 +155,15 @@ bn_mul_mont: cbz x21,.Linner_skip .Linner: - ldr x8,[x1],#8 + ldr x8,[PTR(1)],#8 adc x13,x13,xzr - ldr x23,[x22],#8 // tp[j] + ldr x23,[PTR(22)],#8 // tp[j] adds x6,x10,x7 sub x21,x21,#8 // j-- adc x7,x11,xzr adds x12,x16,x13 - ldr x14,[x3],#8 + ldr x14,[PTR(3)],#8 adc x13,x17,xzr mul x10,x8,x9 // ap[j]*bp[i] @@ -151,20 +174,34 @@ bn_mul_mont: mul x16,x14,x15 // np[j]*m1 adds x12,x12,x6 umulh x17,x14,x15 - stur x12,[x22,#-16] // tp[j-1] + stur x12,[PTR(22),#-16] // tp[j-1] cbnz x21,.Linner .Linner_skip: - ldr x23,[x22],#8 // tp[j] +#ifdef __CHERI_PURE_CAPABILITY__ + neg x5,x5 +#endif + ldr x23,[PTR(22)],#8 // tp[j] adc x13,x13,xzr adds x6,x10,x7 - sub x1,x1,x5 // rewind x1 +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(1),PTR(1),x5 // rewind PTR(1) +#else + sub PTR(1),PTR(1),x5 // rewind PTR(1) +#endif adc x7,x11,xzr adds x12,x16,x13 - sub x3,x3,x5 // rewind x3 +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(3),PTR(3),x5 // rewind PTR(3) +#else + sub PTR(3),PTR(3),x5 // rewind PTR(3) +#endif adcs x13,x17,x19 adc x19,xzr,xzr +#ifdef __CHERI_PURE_CAPABILITY__ + neg x5,x5 +#endif adds x6,x6,x23 adc x7,x7,xzr @@ -172,7 +209,7 @@ bn_mul_mont: adds x12,x12,x6 adcs x13,x13,x7 adc 
x19,x19,xzr // upmost overflow bit - stp x12,x13,[x22,#-16] + stp x12,x13,[PTR(22),#-16] cbnz x20,.Louter @@ -180,47 +217,47 @@ bn_mul_mont: // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. - ldr x23,[sp] // tp[0] - add x22,sp,#8 - ldr x14,[x3],#8 // np[0] + ldr x23,[PTRN(sp)] // tp[0] + add PTR(22),PTRN(sp),#8 + ldr x14,[PTR(3)],#8 // np[0] subs x21,x5,#8 // j=num-1 and clear borrow - mov x1,x0 + mov PTR(1),PTR(0) .Lsub: sbcs x8,x23,x14 // tp[j]-np[j] - ldr x23,[x22],#8 + ldr x23,[PTR(22)],#8 sub x21,x21,#8 // j-- - ldr x14,[x3],#8 - str x8,[x1],#8 // rp[j]=tp[j]-np[j] + ldr x14,[PTR(3)],#8 + str x8,[PTR(1)],#8 // rp[j]=tp[j]-np[j] cbnz x21,.Lsub sbcs x8,x23,x14 sbcs x19,x19,xzr // did it borrow? - str x8,[x1],#8 // rp[num-1] + str x8,[PTR(1)],#8 // rp[num-1] - ldr x23,[sp] // tp[0] - add x22,sp,#8 - ldr x8,[x0],#8 // rp[0] + ldr x23,[PTRN(sp)] // tp[0] + add PTR(22),PTRN(sp),#8 + ldr x8,[PTR(0)],#8 // rp[0] sub x5,x5,#8 // num-- nop .Lcond_copy: sub x5,x5,#8 // num-- csel x14,x23,x8,lo // did it borrow? - ldr x23,[x22],#8 - ldr x8,[x0],#8 - stur xzr,[x22,#-16] // wipe tp - stur x14,[x0,#-16] + ldr x23,[PTR(22)],#8 + ldr x8,[PTR(0)],#8 + stur xzr,[PTR(22),#-16] // wipe tp + stur x14,[PTR(0),#-16] cbnz x5,.Lcond_copy csel x14,x23,x8,lo - stur xzr,[x22,#-8] // wipe tp - stur x14,[x0,#-8] + stur xzr,[PTR(22),#-8] // wipe tp + stur x14,[PTR(0),#-8] - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + mov PTRN(sp),PTR(29) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] mov x0,#1 - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(8*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size bn_mul_mont,.-bn_mul_mont @@ -229,53 +266,64 @@ bn_mul_mont: bn_mul8x_mont_neon: // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to // only from bn_mul_mont which has already signed the return address. - stp x29,x30,[sp,#-80]! - mov x16,sp - stp d8,d9,[sp,#16] - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH+64)]! 
+ mov PTR(16),PTRN(sp) + stp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] + stp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + stp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + stp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] lsl x5,x5,#1 eor v14.16b,v14.16b,v14.16b +#ifdef __CHERI_PURE_CAPABILITY__ + neg x12,x5 +#endif .align 4 .LNEON_8n: eor v6.16b,v6.16b,v6.16b - sub x7,sp,#128 + sub PTR(7),PTRN(sp),#128 eor v7.16b,v7.16b,v7.16b - sub x7,x7,x5,lsl#4 +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(7),PTR(7),x12,lsl#4 +#else + sub PTR(7),PTR(7),x5,lsl#4 +#endif eor v8.16b,v8.16b,v8.16b - and x7,x7,#-64 +#ifdef __CHERI_PURE_CAPABILITY__ + alignd PTR(7),PTR(7),#6 +#else + and PTR(7),PTR(7),#-64 +#endif eor v9.16b,v9.16b,v9.16b - mov sp,x7 // alloca + mov PTRN(sp),PTR(7) // alloca eor v10.16b,v10.16b,v10.16b - add x7,x7,#256 + add PTR(7),PTR(7),#256 eor v11.16b,v11.16b,v11.16b sub x8,x5,#8 eor v12.16b,v12.16b,v12.16b eor v13.16b,v13.16b,v13.16b .LNEON_8n_init: - st1 {v6.2d,v7.2d},[x7],#32 + st1 {v6.2d,v7.2d},[PTR(7)],#32 subs x8,x8,#8 - st1 {v8.2d,v9.2d},[x7],#32 - st1 {v10.2d,v11.2d},[x7],#32 - st1 {v12.2d,v13.2d},[x7],#32 + st1 {v8.2d,v9.2d},[PTR(7)],#32 + st1 {v10.2d,v11.2d},[PTR(7)],#32 + st1 {v12.2d,v13.2d},[PTR(7)],#32 bne .LNEON_8n_init - add x6,sp,#256 - ld1 {v0.4s,v1.4s},[x1],#32 - add x10,sp,#8 - ldr s30,[x4],#4 + add PTR(6),PTRN(sp),#256 + ld1 {v0.4s,v1.4s},[PTR(1)],#32 + add PTR(10),PTRN(sp),#8 + ldr s30,[PTR(4)],#4 mov x9,x5 b .LNEON_8n_outer .align 4 .LNEON_8n_outer: - ldr s28,[x2],#4 // *b++ + ldr s28,[PTR(2)],#4 // *b++ uxtl v28.4s,v28.4h - add x7,sp,#128 - ld1 {v2.4s,v3.4s},[x3],#32 + add PTR(7),PTRN(sp),#128 + ld1 {v2.4s,v3.4s},[PTR(3)],#32 umlal v6.2d,v28.2s,v0.s[0] umlal v7.2d,v28.2s,v0.s[1] @@ -287,11 +335,11 @@ bn_mul8x_mont_neon: umlal v10.2d,v28.2s,v1.s[0] mul v29.2s,v29.2s,v30.2s umlal v11.2d,v28.2s,v1.s[1] - st1 {v28.2s},[sp] // put aside smashed b[8*i+0] + st1 {v28.2s},[PTRN(sp)] // put aside smashed b[8*i+0] umlal v12.2d,v28.2s,v1.s[2] uxtl v29.4s,v29.4h umlal v13.2d,v28.2s,v1.s[3] - ldr s28,[x2],#4 // *b++ + ldr s28,[PTR(2)],#4 // *b++ umlal v6.2d,v29.2s,v2.s[0] umlal v7.2d,v29.2s,v2.s[1] uxtl v28.4s,v28.4h @@ -307,9 +355,9 @@ bn_mul8x_mont_neon: umlal v13.2d,v29.2s,v3.s[3] add v16.2d,v7.2d,v6.2d ins v7.d[0],v16.d[0] - st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0] + st1 {v29.2s},[PTR(10)],#8 // put aside smashed m[8*i+0] umlal v7.2d,v28.2s,v0.s[0] - ld1 {v6.2d},[x6],#16 + ld1 {v6.2d},[PTR(6)],#16 umlal v8.2d,v28.2s,v0.s[1] umlal v9.2d,v28.2s,v0.s[2] shl v29.2d,v7.2d,#16 @@ -319,11 +367,11 @@ bn_mul8x_mont_neon: umlal v11.2d,v28.2s,v1.s[0] mul v29.2s,v29.2s,v30.2s umlal v12.2d,v28.2s,v1.s[1] - st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1] + st1 {v28.2s},[PTR(10)],#8 // put aside smashed b[8*i+1] umlal v13.2d,v28.2s,v1.s[2] uxtl v29.4s,v29.4h umlal v6.2d,v28.2s,v1.s[3] - ldr s28,[x2],#4 // *b++ + ldr s28,[PTR(2)],#4 // *b++ umlal v7.2d,v29.2s,v2.s[0] umlal v8.2d,v29.2s,v2.s[1] uxtl v28.4s,v28.4h @@ -339,9 +387,9 @@ bn_mul8x_mont_neon: umlal v6.2d,v29.2s,v3.s[3] add v16.2d,v8.2d,v7.2d ins v8.d[0],v16.d[0] - st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1] + st1 {v29.2s},[PTR(10)],#8 // put aside smashed m[8*i+1] umlal v8.2d,v28.2s,v0.s[0] - ld1 {v7.2d},[x6],#16 + ld1 {v7.2d},[PTR(6)],#16 umlal v9.2d,v28.2s,v0.s[1] umlal v10.2d,v28.2s,v0.s[2] shl v29.2d,v8.2d,#16 @@ -351,11 +399,11 @@ bn_mul8x_mont_neon: umlal v12.2d,v28.2s,v1.s[0] mul v29.2s,v29.2s,v30.2s umlal v13.2d,v28.2s,v1.s[1] - st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2] + st1 {v28.2s},[PTR(10)],#8 // put aside smashed 
b[8*i+2] umlal v6.2d,v28.2s,v1.s[2] uxtl v29.4s,v29.4h umlal v7.2d,v28.2s,v1.s[3] - ldr s28,[x2],#4 // *b++ + ldr s28,[PTR(2)],#4 // *b++ umlal v8.2d,v29.2s,v2.s[0] umlal v9.2d,v29.2s,v2.s[1] uxtl v28.4s,v28.4h @@ -371,9 +419,9 @@ bn_mul8x_mont_neon: umlal v7.2d,v29.2s,v3.s[3] add v16.2d,v9.2d,v8.2d ins v9.d[0],v16.d[0] - st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2] + st1 {v29.2s},[PTR(10)],#8 // put aside smashed m[8*i+2] umlal v9.2d,v28.2s,v0.s[0] - ld1 {v8.2d},[x6],#16 + ld1 {v8.2d},[PTR(6)],#16 umlal v10.2d,v28.2s,v0.s[1] umlal v11.2d,v28.2s,v0.s[2] shl v29.2d,v9.2d,#16 @@ -383,11 +431,11 @@ bn_mul8x_mont_neon: umlal v13.2d,v28.2s,v1.s[0] mul v29.2s,v29.2s,v30.2s umlal v6.2d,v28.2s,v1.s[1] - st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3] + st1 {v28.2s},[PTR(10)],#8 // put aside smashed b[8*i+3] umlal v7.2d,v28.2s,v1.s[2] uxtl v29.4s,v29.4h umlal v8.2d,v28.2s,v1.s[3] - ldr s28,[x2],#4 // *b++ + ldr s28,[PTR(2)],#4 // *b++ umlal v9.2d,v29.2s,v2.s[0] umlal v10.2d,v29.2s,v2.s[1] uxtl v28.4s,v28.4h @@ -403,9 +451,9 @@ bn_mul8x_mont_neon: umlal v8.2d,v29.2s,v3.s[3] add v16.2d,v10.2d,v9.2d ins v10.d[0],v16.d[0] - st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3] + st1 {v29.2s},[PTR(10)],#8 // put aside smashed m[8*i+3] umlal v10.2d,v28.2s,v0.s[0] - ld1 {v9.2d},[x6],#16 + ld1 {v9.2d},[PTR(6)],#16 umlal v11.2d,v28.2s,v0.s[1] umlal v12.2d,v28.2s,v0.s[2] shl v29.2d,v10.2d,#16 @@ -415,11 +463,11 @@ bn_mul8x_mont_neon: umlal v6.2d,v28.2s,v1.s[0] mul v29.2s,v29.2s,v30.2s umlal v7.2d,v28.2s,v1.s[1] - st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4] + st1 {v28.2s},[PTR(10)],#8 // put aside smashed b[8*i+4] umlal v8.2d,v28.2s,v1.s[2] uxtl v29.4s,v29.4h umlal v9.2d,v28.2s,v1.s[3] - ldr s28,[x2],#4 // *b++ + ldr s28,[PTR(2)],#4 // *b++ umlal v10.2d,v29.2s,v2.s[0] umlal v11.2d,v29.2s,v2.s[1] uxtl v28.4s,v28.4h @@ -435,9 +483,9 @@ bn_mul8x_mont_neon: umlal v9.2d,v29.2s,v3.s[3] add v16.2d,v11.2d,v10.2d ins v11.d[0],v16.d[0] - st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4] + st1 {v29.2s},[PTR(10)],#8 // put aside smashed m[8*i+4] umlal v11.2d,v28.2s,v0.s[0] - ld1 {v10.2d},[x6],#16 + ld1 {v10.2d},[PTR(6)],#16 umlal v12.2d,v28.2s,v0.s[1] umlal v13.2d,v28.2s,v0.s[2] shl v29.2d,v11.2d,#16 @@ -447,11 +495,11 @@ bn_mul8x_mont_neon: umlal v7.2d,v28.2s,v1.s[0] mul v29.2s,v29.2s,v30.2s umlal v8.2d,v28.2s,v1.s[1] - st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5] + st1 {v28.2s},[PTR(10)],#8 // put aside smashed b[8*i+5] umlal v9.2d,v28.2s,v1.s[2] uxtl v29.4s,v29.4h umlal v10.2d,v28.2s,v1.s[3] - ldr s28,[x2],#4 // *b++ + ldr s28,[PTR(2)],#4 // *b++ umlal v11.2d,v29.2s,v2.s[0] umlal v12.2d,v29.2s,v2.s[1] uxtl v28.4s,v28.4h @@ -467,9 +515,9 @@ bn_mul8x_mont_neon: umlal v10.2d,v29.2s,v3.s[3] add v16.2d,v12.2d,v11.2d ins v12.d[0],v16.d[0] - st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5] + st1 {v29.2s},[PTR(10)],#8 // put aside smashed m[8*i+5] umlal v12.2d,v28.2s,v0.s[0] - ld1 {v11.2d},[x6],#16 + ld1 {v11.2d},[PTR(6)],#16 umlal v13.2d,v28.2s,v0.s[1] umlal v6.2d,v28.2s,v0.s[2] shl v29.2d,v12.2d,#16 @@ -479,11 +527,11 @@ bn_mul8x_mont_neon: umlal v8.2d,v28.2s,v1.s[0] mul v29.2s,v29.2s,v30.2s umlal v9.2d,v28.2s,v1.s[1] - st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6] + st1 {v28.2s},[PTR(10)],#8 // put aside smashed b[8*i+6] umlal v10.2d,v28.2s,v1.s[2] uxtl v29.4s,v29.4h umlal v11.2d,v28.2s,v1.s[3] - ldr s28,[x2],#4 // *b++ + ldr s28,[PTR(2)],#4 // *b++ umlal v12.2d,v29.2s,v2.s[0] umlal v13.2d,v29.2s,v2.s[1] uxtl v28.4s,v28.4h @@ -499,9 +547,9 @@ bn_mul8x_mont_neon: umlal 
v11.2d,v29.2s,v3.s[3] add v16.2d,v13.2d,v12.2d ins v13.d[0],v16.d[0] - st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6] + st1 {v29.2s},[PTR(10)],#8 // put aside smashed m[8*i+6] umlal v13.2d,v28.2s,v0.s[0] - ld1 {v12.2d},[x6],#16 + ld1 {v12.2d},[PTR(6)],#16 umlal v6.2d,v28.2s,v0.s[1] umlal v7.2d,v28.2s,v0.s[2] shl v29.2d,v13.2d,#16 @@ -511,13 +559,13 @@ bn_mul8x_mont_neon: umlal v9.2d,v28.2s,v1.s[0] mul v29.2s,v29.2s,v30.2s umlal v10.2d,v28.2s,v1.s[1] - st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7] + st1 {v28.2s},[PTR(10)],#8 // put aside smashed b[8*i+7] umlal v11.2d,v28.2s,v1.s[2] uxtl v29.4s,v29.4h umlal v12.2d,v28.2s,v1.s[3] - ld1 {v28.2s},[sp] // pull smashed b[8*i+0] + ld1 {v28.2s},[PTRN(sp)] // pull smashed b[8*i+0] umlal v13.2d,v29.2s,v2.s[0] - ld1 {v0.4s,v1.4s},[x1],#32 + ld1 {v0.4s,v1.4s},[PTR(1)],#32 umlal v6.2d,v29.2s,v2.s[1] umlal v7.2d,v29.2s,v2.s[2] mov v5.16b,v13.16b @@ -533,8 +581,8 @@ bn_mul8x_mont_neon: umlal v11.2d,v29.2s,v3.s[2] umlal v12.2d,v29.2s,v3.s[3] add v6.2d,v6.2d,v13.2d - st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7] - add x10,sp,#8 // rewind + st1 {v29.2s},[PTR(10)],#8 // put aside smashed m[8*i+7] + add PTR(10),PTRN(sp),#8 // rewind sub x8,x5,#8 b .LNEON_8n_inner @@ -542,20 +590,20 @@ bn_mul8x_mont_neon: .LNEON_8n_inner: subs x8,x8,#8 umlal v6.2d,v28.2s,v0.s[0] - ld1 {v13.2d},[x6] + ld1 {v13.2d},[PTR(6)] umlal v7.2d,v28.2s,v0.s[1] - ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0] + ld1 {v29.2s},[PTR(10)],#8 // pull smashed m[8*i+0] umlal v8.2d,v28.2s,v0.s[2] - ld1 {v2.4s,v3.4s},[x3],#32 + ld1 {v2.4s,v3.4s},[PTR(3)],#32 umlal v9.2d,v28.2s,v0.s[3] b.eq .LInner_jump - add x6,x6,#16 // don't advance in last iteration + add PTR(6),PTR(6),#16 // don't advance in last iteration .LInner_jump: umlal v10.2d,v28.2s,v1.s[0] umlal v11.2d,v28.2s,v1.s[1] umlal v12.2d,v28.2s,v1.s[2] umlal v13.2d,v28.2s,v1.s[3] - ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1] + ld1 {v28.2s},[PTR(10)],#8 // pull smashed b[8*i+1] umlal v6.2d,v29.2s,v2.s[0] umlal v7.2d,v29.2s,v2.s[1] umlal v8.2d,v29.2s,v2.s[2] @@ -564,21 +612,21 @@ bn_mul8x_mont_neon: umlal v11.2d,v29.2s,v3.s[1] umlal v12.2d,v29.2s,v3.s[2] umlal v13.2d,v29.2s,v3.s[3] - st1 {v6.2d},[x7],#16 + st1 {v6.2d},[PTR(7)],#16 umlal v7.2d,v28.2s,v0.s[0] - ld1 {v6.2d},[x6] + ld1 {v6.2d},[PTR(6)] umlal v8.2d,v28.2s,v0.s[1] - ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1] + ld1 {v29.2s},[PTR(10)],#8 // pull smashed m[8*i+1] umlal v9.2d,v28.2s,v0.s[2] b.eq .LInner_jump1 - add x6,x6,#16 // don't advance in last iteration + add PTR(6),PTR(6),#16 // don't advance in last iteration .LInner_jump1: umlal v10.2d,v28.2s,v0.s[3] umlal v11.2d,v28.2s,v1.s[0] umlal v12.2d,v28.2s,v1.s[1] umlal v13.2d,v28.2s,v1.s[2] umlal v6.2d,v28.2s,v1.s[3] - ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2] + ld1 {v28.2s},[PTR(10)],#8 // pull smashed b[8*i+2] umlal v7.2d,v29.2s,v2.s[0] umlal v8.2d,v29.2s,v2.s[1] umlal v9.2d,v29.2s,v2.s[2] @@ -587,21 +635,21 @@ bn_mul8x_mont_neon: umlal v12.2d,v29.2s,v3.s[1] umlal v13.2d,v29.2s,v3.s[2] umlal v6.2d,v29.2s,v3.s[3] - st1 {v7.2d},[x7],#16 + st1 {v7.2d},[PTR(7)],#16 umlal v8.2d,v28.2s,v0.s[0] - ld1 {v7.2d},[x6] + ld1 {v7.2d},[PTR(6)] umlal v9.2d,v28.2s,v0.s[1] - ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2] + ld1 {v29.2s},[PTR(10)],#8 // pull smashed m[8*i+2] umlal v10.2d,v28.2s,v0.s[2] b.eq .LInner_jump2 - add x6,x6,#16 // don't advance in last iteration + add PTR(6),PTR(6),#16 // don't advance in last iteration .LInner_jump2: umlal v11.2d,v28.2s,v0.s[3] umlal v12.2d,v28.2s,v1.s[0] umlal v13.2d,v28.2s,v1.s[1] 
umlal v6.2d,v28.2s,v1.s[2] umlal v7.2d,v28.2s,v1.s[3] - ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3] + ld1 {v28.2s},[PTR(10)],#8 // pull smashed b[8*i+3] umlal v8.2d,v29.2s,v2.s[0] umlal v9.2d,v29.2s,v2.s[1] umlal v10.2d,v29.2s,v2.s[2] @@ -610,21 +658,21 @@ bn_mul8x_mont_neon: umlal v13.2d,v29.2s,v3.s[1] umlal v6.2d,v29.2s,v3.s[2] umlal v7.2d,v29.2s,v3.s[3] - st1 {v8.2d},[x7],#16 + st1 {v8.2d},[PTR(7)],#16 umlal v9.2d,v28.2s,v0.s[0] - ld1 {v8.2d},[x6] + ld1 {v8.2d},[PTR(6)] umlal v10.2d,v28.2s,v0.s[1] - ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3] + ld1 {v29.2s},[PTR(10)],#8 // pull smashed m[8*i+3] umlal v11.2d,v28.2s,v0.s[2] b.eq .LInner_jump3 - add x6,x6,#16 // don't advance in last iteration + add PTR(6),PTR(6),#16 // don't advance in last iteration .LInner_jump3: umlal v12.2d,v28.2s,v0.s[3] umlal v13.2d,v28.2s,v1.s[0] umlal v6.2d,v28.2s,v1.s[1] umlal v7.2d,v28.2s,v1.s[2] umlal v8.2d,v28.2s,v1.s[3] - ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4] + ld1 {v28.2s},[PTR(10)],#8 // pull smashed b[8*i+4] umlal v9.2d,v29.2s,v2.s[0] umlal v10.2d,v29.2s,v2.s[1] umlal v11.2d,v29.2s,v2.s[2] @@ -633,21 +681,21 @@ bn_mul8x_mont_neon: umlal v6.2d,v29.2s,v3.s[1] umlal v7.2d,v29.2s,v3.s[2] umlal v8.2d,v29.2s,v3.s[3] - st1 {v9.2d},[x7],#16 + st1 {v9.2d},[PTR(7)],#16 umlal v10.2d,v28.2s,v0.s[0] - ld1 {v9.2d},[x6] + ld1 {v9.2d},[PTR(6)] umlal v11.2d,v28.2s,v0.s[1] - ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4] + ld1 {v29.2s},[PTR(10)],#8 // pull smashed m[8*i+4] umlal v12.2d,v28.2s,v0.s[2] b.eq .LInner_jump4 - add x6,x6,#16 // don't advance in last iteration + add PTR(6),PTR(6),#16 // don't advance in last iteration .LInner_jump4: umlal v13.2d,v28.2s,v0.s[3] umlal v6.2d,v28.2s,v1.s[0] umlal v7.2d,v28.2s,v1.s[1] umlal v8.2d,v28.2s,v1.s[2] umlal v9.2d,v28.2s,v1.s[3] - ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5] + ld1 {v28.2s},[PTR(10)],#8 // pull smashed b[8*i+5] umlal v10.2d,v29.2s,v2.s[0] umlal v11.2d,v29.2s,v2.s[1] umlal v12.2d,v29.2s,v2.s[2] @@ -656,21 +704,21 @@ bn_mul8x_mont_neon: umlal v7.2d,v29.2s,v3.s[1] umlal v8.2d,v29.2s,v3.s[2] umlal v9.2d,v29.2s,v3.s[3] - st1 {v10.2d},[x7],#16 + st1 {v10.2d},[PTR(7)],#16 umlal v11.2d,v28.2s,v0.s[0] - ld1 {v10.2d},[x6] + ld1 {v10.2d},[PTR(6)] umlal v12.2d,v28.2s,v0.s[1] - ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5] + ld1 {v29.2s},[PTR(10)],#8 // pull smashed m[8*i+5] umlal v13.2d,v28.2s,v0.s[2] b.eq .LInner_jump5 - add x6,x6,#16 // don't advance in last iteration + add PTR(6),PTR(6),#16 // don't advance in last iteration .LInner_jump5: umlal v6.2d,v28.2s,v0.s[3] umlal v7.2d,v28.2s,v1.s[0] umlal v8.2d,v28.2s,v1.s[1] umlal v9.2d,v28.2s,v1.s[2] umlal v10.2d,v28.2s,v1.s[3] - ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6] + ld1 {v28.2s},[PTR(10)],#8 // pull smashed b[8*i+6] umlal v11.2d,v29.2s,v2.s[0] umlal v12.2d,v29.2s,v2.s[1] umlal v13.2d,v29.2s,v2.s[2] @@ -679,21 +727,21 @@ bn_mul8x_mont_neon: umlal v8.2d,v29.2s,v3.s[1] umlal v9.2d,v29.2s,v3.s[2] umlal v10.2d,v29.2s,v3.s[3] - st1 {v11.2d},[x7],#16 + st1 {v11.2d},[PTR(7)],#16 umlal v12.2d,v28.2s,v0.s[0] - ld1 {v11.2d},[x6] + ld1 {v11.2d},[PTR(6)] umlal v13.2d,v28.2s,v0.s[1] - ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6] + ld1 {v29.2s},[PTR(10)],#8 // pull smashed m[8*i+6] umlal v6.2d,v28.2s,v0.s[2] b.eq .LInner_jump6 - add x6,x6,#16 // don't advance in last iteration + add PTR(6),PTR(6),#16 // don't advance in last iteration .LInner_jump6: umlal v7.2d,v28.2s,v0.s[3] umlal v8.2d,v28.2s,v1.s[0] umlal v9.2d,v28.2s,v1.s[1] umlal v10.2d,v28.2s,v1.s[2] umlal v11.2d,v28.2s,v1.s[3] - ld1 {v28.2s},[x10],#8 
// pull smashed b[8*i+7] + ld1 {v28.2s},[PTR(10)],#8 // pull smashed b[8*i+7] umlal v12.2d,v29.2s,v2.s[0] umlal v13.2d,v29.2s,v2.s[1] umlal v6.2d,v29.2s,v2.s[2] @@ -702,14 +750,14 @@ bn_mul8x_mont_neon: umlal v9.2d,v29.2s,v3.s[1] umlal v10.2d,v29.2s,v3.s[2] umlal v11.2d,v29.2s,v3.s[3] - st1 {v12.2d},[x7],#16 + st1 {v12.2d},[PTR(7)],#16 umlal v13.2d,v28.2s,v0.s[0] - ld1 {v12.2d},[x6] + ld1 {v12.2d},[PTR(6)] umlal v6.2d,v28.2s,v0.s[1] - ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7] + ld1 {v29.2s},[PTR(10)],#8 // pull smashed m[8*i+7] umlal v7.2d,v28.2s,v0.s[2] b.eq .LInner_jump7 - add x6,x6,#16 // don't advance in last iteration + add PTR(6),PTR(6),#16 // don't advance in last iteration .LInner_jump7: umlal v8.2d,v28.2s,v0.s[3] umlal v9.2d,v28.2s,v1.s[0] @@ -717,51 +765,59 @@ bn_mul8x_mont_neon: umlal v11.2d,v28.2s,v1.s[2] umlal v12.2d,v28.2s,v1.s[3] b.ne .LInner_after_rewind8 - sub x1,x1,x5,lsl#2 // rewind +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(1),PTR(1),x12,lsl#2 // rewind +#else + sub PTR(1),PTR(1),x5,lsl#2 // rewind +#endif .LInner_after_rewind8: umlal v13.2d,v29.2s,v2.s[0] - ld1 {v28.2s},[sp] // pull smashed b[8*i+0] + ld1 {v28.2s},[PTRN(sp)] // pull smashed b[8*i+0] umlal v6.2d,v29.2s,v2.s[1] - ld1 {v0.4s,v1.4s},[x1],#32 + ld1 {v0.4s,v1.4s},[PTR(1)],#32 umlal v7.2d,v29.2s,v2.s[2] - add x10,sp,#8 // rewind + add PTR(10),PTRN(sp),#8 // rewind umlal v8.2d,v29.2s,v2.s[3] umlal v9.2d,v29.2s,v3.s[0] umlal v10.2d,v29.2s,v3.s[1] umlal v11.2d,v29.2s,v3.s[2] - st1 {v13.2d},[x7],#16 + st1 {v13.2d},[PTR(7)],#16 umlal v12.2d,v29.2s,v3.s[3] bne .LNEON_8n_inner - add x6,sp,#128 - st1 {v6.2d,v7.2d},[x7],#32 + add PTR(6),PTRN(sp),#128 + st1 {v6.2d,v7.2d},[PTR(7)],#32 eor v2.16b,v2.16b,v2.16b // v2 - st1 {v8.2d,v9.2d},[x7],#32 + st1 {v8.2d,v9.2d},[PTR(7)],#32 eor v3.16b,v3.16b,v3.16b // v3 - st1 {v10.2d,v11.2d},[x7],#32 - st1 {v12.2d},[x7] + st1 {v10.2d,v11.2d},[PTR(7)],#32 + st1 {v12.2d},[PTR(7)] subs x9,x9,#8 - ld1 {v6.2d,v7.2d},[x6],#32 - ld1 {v8.2d,v9.2d},[x6],#32 - ld1 {v10.2d,v11.2d},[x6],#32 - ld1 {v12.2d,v13.2d},[x6],#32 + ld1 {v6.2d,v7.2d},[PTR(6)],#32 + ld1 {v8.2d,v9.2d},[PTR(6)],#32 + ld1 {v10.2d,v11.2d},[PTR(6)],#32 + ld1 {v12.2d,v13.2d},[PTR(6)],#32 b.eq .LInner_8n_jump_2steps - sub x3,x3,x5,lsl#2 // rewind +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(3),PTR(3),x12,lsl#2 // rewind +#else + sub PTR(3),PTR(3),x5,lsl#2 // rewind +#endif b .LNEON_8n_outer .LInner_8n_jump_2steps: - add x7,sp,#128 - st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame + add PTR(7),PTRN(sp),#128 + st1 {v2.2d,v3.2d}, [PTRN(sp)],#32 // start wiping stack frame mov v5.16b,v6.16b ushr v15.2d,v6.2d,#16 ext v6.16b,v6.16b,v6.16b,#8 - st1 {v2.2d,v3.2d}, [sp],#32 + st1 {v2.2d,v3.2d}, [PTRN(sp)],#32 add v6.2d,v6.2d,v15.2d - st1 {v2.2d,v3.2d}, [sp],#32 + st1 {v2.2d,v3.2d}, [PTRN(sp)],#32 ushr v15.2d,v6.2d,#16 - st1 {v2.2d,v3.2d}, [sp],#32 + st1 {v2.2d,v3.2d}, [PTRN(sp)],#32 zip1 v6.4h,v5.4h,v6.4h ins v15.d[1],v14.d[0] @@ -774,17 +830,17 @@ bn_mul8x_mont_neon: mov v5.16b,v6.16b ushr v15.2d,v6.2d,#16 ext v6.16b,v6.16b,v6.16b,#8 - ld1 {v8.2d,v9.2d}, [x6],#32 + ld1 {v8.2d,v9.2d}, [PTR(6)],#32 add v6.2d,v6.2d,v15.2d - ld1 {v10.2d,v11.2d}, [x6],#32 + ld1 {v10.2d,v11.2d}, [PTR(6)],#32 ushr v15.2d,v6.2d,#16 - ld1 {v12.2d,v13.2d}, [x6],#32 + ld1 {v12.2d,v13.2d}, [PTR(6)],#32 zip1 v6.4h,v5.4h,v6.4h ins v15.d[1],v14.d[0] .LNEON_tail_entry: add v7.2d,v7.2d,v15.2d - st1 {v6.s}[0], [x7],#4 + st1 {v6.s}[0], [PTR(7)],#4 ushr v15.2d,v7.2d,#16 mov v5.16b,v7.16b ext v7.16b,v7.16b,v7.16b,#8 @@ -793,7 +849,7 @@ bn_mul8x_mont_neon: zip1 
v7.4h,v5.4h,v7.4h ins v15.d[1],v14.d[0] add v8.2d,v8.2d,v15.2d - st1 {v7.s}[0], [x7],#4 + st1 {v7.s}[0], [PTR(7)],#4 ushr v15.2d,v8.2d,#16 mov v5.16b,v8.16b ext v8.16b,v8.16b,v8.16b,#8 @@ -802,7 +858,7 @@ bn_mul8x_mont_neon: zip1 v8.4h,v5.4h,v8.4h ins v15.d[1],v14.d[0] add v9.2d,v9.2d,v15.2d - st1 {v8.s}[0], [x7],#4 + st1 {v8.s}[0], [PTR(7)],#4 ushr v15.2d,v9.2d,#16 mov v5.16b,v9.16b ext v9.16b,v9.16b,v9.16b,#8 @@ -811,7 +867,7 @@ bn_mul8x_mont_neon: zip1 v9.4h,v5.4h,v9.4h ins v15.d[1],v14.d[0] add v10.2d,v10.2d,v15.2d - st1 {v9.s}[0], [x7],#4 + st1 {v9.s}[0], [PTR(7)],#4 ushr v15.2d,v10.2d,#16 mov v5.16b,v10.16b ext v10.16b,v10.16b,v10.16b,#8 @@ -820,7 +876,7 @@ bn_mul8x_mont_neon: zip1 v10.4h,v5.4h,v10.4h ins v15.d[1],v14.d[0] add v11.2d,v11.2d,v15.2d - st1 {v10.s}[0], [x7],#4 + st1 {v10.s}[0], [PTR(7)],#4 ushr v15.2d,v11.2d,#16 mov v5.16b,v11.16b ext v11.16b,v11.16b,v11.16b,#8 @@ -829,7 +885,7 @@ bn_mul8x_mont_neon: zip1 v11.4h,v5.4h,v11.4h ins v15.d[1],v14.d[0] add v12.2d,v12.2d,v15.2d - st1 {v11.s}[0], [x7],#4 + st1 {v11.s}[0], [PTR(7)],#4 ushr v15.2d,v12.2d,#16 mov v5.16b,v12.16b ext v12.16b,v12.16b,v12.16b,#8 @@ -838,7 +894,7 @@ bn_mul8x_mont_neon: zip1 v12.4h,v5.4h,v12.4h ins v15.d[1],v14.d[0] add v13.2d,v13.2d,v15.2d - st1 {v12.s}[0], [x7],#4 + st1 {v12.s}[0], [PTR(7)],#4 ushr v15.2d,v13.2d,#16 mov v5.16b,v13.16b ext v13.16b,v13.16b,v13.16b,#8 @@ -846,81 +902,92 @@ bn_mul8x_mont_neon: ushr v15.2d,v13.2d,#16 zip1 v13.4h,v5.4h,v13.4h ins v15.d[1],v14.d[0] - ld1 {v6.2d,v7.2d}, [x6],#32 + ld1 {v6.2d,v7.2d}, [PTR(6)],#32 subs x8,x8,#8 - st1 {v13.s}[0], [x7],#4 + st1 {v13.s}[0], [PTR(7)],#4 bne .LNEON_tail - st1 {v15.s}[0], [x7],#4 // top-most bit - sub x3,x3,x5,lsl#2 // rewind x3 - subs x1,sp,#0 // clear carry flag - add x2,sp,x5,lsl#2 + st1 {v15.s}[0], [PTR(7)],#4 // top-most bit +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(3),PTR(3),x12,lsl#2 // rewind PTR(3) + sub PTR(1),PTRN(sp),#0 + cmn x0,xzr // clear carry flag +#else + sub PTR(3),PTR(3),x5,lsl#2 // rewind PTR(3) + subs PTR(1),PTRN(sp),#0 // clear carry flag +#endif + add PTR(2),PTRN(sp),x5,lsl#2 .LNEON_sub: - ldp w4,w5,[x1],#8 - ldp w6,w7,[x1],#8 - ldp w8,w9,[x3],#8 - ldp w10,w11,[x3],#8 + ldp w4,w5,[PTR(1)],#8 + ldp w6,w7,[PTR(1)],#8 + ldp w8,w9,[PTR(3)],#8 + ldp w10,w11,[PTR(3)],#8 sbcs w8,w4,w8 sbcs w9,w5,w9 sbcs w10,w6,w10 sbcs w11,w7,w11 sub x17,x2,x1 - stp w8,w9,[x0],#8 - stp w10,w11,[x0],#8 + stp w8,w9,[PTR(0)],#8 + stp w10,w11,[PTR(0)],#8 cbnz x17,.LNEON_sub - ldr w10, [x1] // load top-most bit + ldr w10, [PTR(1)] // load top-most bit mov x11,sp eor v0.16b,v0.16b,v0.16b sub x11,x2,x11 // this is num*4 eor v1.16b,v1.16b,v1.16b - mov x1,sp - sub x0,x0,x11 // rewind x0 - mov x3,x2 // second 3/4th of frame + mov PTR(1),PTRN(sp) +#ifdef __CHERI_PURE_CAPABILITY__ + neg x11,x11 + add PTR(0),PTR(0),x11 // rewind PTR(0) +#else + sub PTR(0),PTR(0),x11 // rewind PTR(0) +#endif + mov PTR(3),PTR(2) // second 3/4th of frame sbcs w10,w10,wzr // result is carry flag .LNEON_copy_n_zap: - ldp w4,w5,[x1],#8 - ldp w6,w7,[x1],#8 - ldp w8,w9,[x0],#8 - ldp w10,w11,[x0] - sub x0,x0,#8 + ldp w4,w5,[PTR(1)],#8 + ldp w6,w7,[PTR(1)],#8 + ldp w8,w9,[PTR(0)],#8 + ldp w10,w11,[PTR(0)] + sub PTR(0),PTR(0),#8 b.cs .LCopy_1 mov w8,w4 mov w9,w5 mov w10,w6 mov w11,w7 .LCopy_1: - st1 {v0.2d,v1.2d}, [x3],#32 // wipe - st1 {v0.2d,v1.2d}, [x3],#32 // wipe - ldp w4,w5,[x1],#8 - ldp w6,w7,[x1],#8 - stp w8,w9,[x0],#8 - stp w10,w11,[x0],#8 - sub x1,x1,#32 - ldp w8,w9,[x0],#8 - ldp w10,w11,[x0] - sub x0,x0,#8 + st1 {v0.2d,v1.2d}, [PTR(3)],#32 // wipe + st1 
{v0.2d,v1.2d}, [PTR(3)],#32 // wipe + ldp w4,w5,[PTR(1)],#8 + ldp w6,w7,[PTR(1)],#8 + stp w8,w9,[PTR(0)],#8 + stp w10,w11,[PTR(0)],#8 + sub PTR(1),PTR(1),#32 + ldp w8,w9,[PTR(0)],#8 + ldp w10,w11,[PTR(0)] + sub PTR(0),PTR(0),#8 b.cs .LCopy_2 mov w8, w4 mov w9, w5 mov w10, w6 mov w11, w7 .LCopy_2: - st1 {v0.2d,v1.2d}, [x1],#32 // wipe - st1 {v0.2d,v1.2d}, [x3],#32 // wipe + st1 {v0.2d,v1.2d}, [PTR(1)],#32 // wipe + st1 {v0.2d,v1.2d}, [PTR(3)],#32 // wipe sub x17,x2,x1 // preserves carry - stp w8,w9,[x0],#8 - stp w10,w11,[x0],#8 + stp w8,w9,[PTR(0)],#8 + stp w10,w11,[PTR(0)],#8 cbnz x17,.LNEON_copy_n_zap - mov sp,x16 - ldp d14,d15,[sp,#64] - ldp d12,d13,[sp,#48] - ldp d10,d11,[sp,#32] - ldp d8,d9,[sp,#16] - ldr x29,[sp],#80 + mov PTRN(sp),PTR(16) + ldp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] + ldp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + ldp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + ldp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH+64) AARCH64_VALIDATE_LINK_REGISTER ret // bx lr @@ -928,48 +995,53 @@ bn_mul8x_mont_neon: .type __bn_sqr8x_mont,%function .align 5 __bn_sqr8x_mont: - cmp x1,x2 + cmp PTR(1),PTR(2) b.ne __bn_mul4x_mont .Lsqr8x_mont: // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to // only from bn_mul_mont which has already signed the return address. - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x0,x3,[sp,#96] // offload rp and np - - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - ldp x10,x11,[x1,#8*4] - ldp x12,x13,[x1,#8*6] - - sub x2,sp,x5,lsl#4 + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + stp PTR(0),PTR(3),[PTRN(sp),#(12*PTR_WIDTH)] // offload rp and np + + ldp x6,x7,[PTR(1),#8*0] + ldp x8,x9,[PTR(1),#8*2] + ldp x10,x11,[PTR(1),#8*4] + ldp x12,x13,[PTR(1),#8*6] + +#ifdef __CHERI_PURE_CAPABILITY__ + neg x14,x5 + add PTR(2),csp,x14,lsl#4 +#else + sub PTR(2),sp,x5,lsl#4 +#endif lsl x5,x5,#3 - ldr x4,[x4] // *n0 - mov sp,x2 // alloca + ldr x4,[PTR(4)] // *n0 + mov PTRN(sp),PTR(2) // alloca sub x27,x5,#8*8 b .Lsqr8x_zero_start .Lsqr8x_zero: sub x27,x27,#8*8 - stp xzr,xzr,[x2,#8*0] - stp xzr,xzr,[x2,#8*2] - stp xzr,xzr,[x2,#8*4] - stp xzr,xzr,[x2,#8*6] + stp xzr,xzr,[PTR(2),#8*0] + stp xzr,xzr,[PTR(2),#8*2] + stp xzr,xzr,[PTR(2),#8*4] + stp xzr,xzr,[PTR(2),#8*6] .Lsqr8x_zero_start: - stp xzr,xzr,[x2,#8*8] - stp xzr,xzr,[x2,#8*10] - stp xzr,xzr,[x2,#8*12] - stp xzr,xzr,[x2,#8*14] - add x2,x2,#8*16 + stp xzr,xzr,[PTR(2),#8*8] + stp xzr,xzr,[PTR(2),#8*10] + stp xzr,xzr,[PTR(2),#8*12] + stp xzr,xzr,[PTR(2),#8*14] + add PTR(2),PTR(2),#8*16 cbnz x27,.Lsqr8x_zero - add x3,x1,x5 - add x1,x1,#8*8 + add PTR(3),PTR(1),x5 + add PTR(1),PTR(1),#8*8 mov x19,xzr mov x20,xzr mov x21,xzr @@ -978,8 +1050,8 @@ __bn_sqr8x_mont: mov x24,xzr mov x25,xzr mov x26,xzr - mov x2,sp - str x4,[x29,#112] // offload n0 + mov PTR(2),PTRN(sp) + str x4,[PTR(29),#(14*PTR_WIDTH)] // offload n0 // Multiply everything but a[i]*a[i] .align 4 @@ -1031,7 +1103,7 @@ __bn_sqr8x_mont: umulh x15,x9,x6 adcs x26,x26,x16 umulh x16,x10,x6 - stp x19,x20,[x2],#8*2 // t[0..1] + stp x19,x20,[PTR(2)],#8*2 // t[0..1] adc x19,xzr,xzr // t[8] adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) umulh x17,x11,x6 @@ -1060,7 
+1132,7 @@ __bn_sqr8x_mont: umulh x16,x10,x7 adcs x19,x19,x17 umulh x17,x11,x7 - stp x21,x22,[x2],#8*2 // t[2..3] + stp x21,x22,[PTR(2)],#8*2 // t[2..3] adc x20,xzr,xzr // t[9] adds x23,x23,x14 umulh x14,x12,x7 @@ -1085,7 +1157,7 @@ __bn_sqr8x_mont: umulh x15,x11,x8 adcs x20,x20,x16 umulh x16,x12,x8 - stp x23,x24,[x2],#8*2 // t[4..5] + stp x23,x24,[PTR(2)],#8*2 // t[4..5] adc x21,xzr,xzr // t[10] adds x25,x25,x17 umulh x17,x13,x8 @@ -1106,7 +1178,7 @@ __bn_sqr8x_mont: umulh x16,x12,x9 adcs x21,x21,x17 umulh x17,x13,x9 - stp x25,x26,[x2],#8*2 // t[6..7] + stp x25,x26,[PTR(2)],#8*2 // t[6..7] adc x22,xzr,xzr // t[11] adds x19,x19,x14 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) @@ -1141,31 +1213,36 @@ __bn_sqr8x_mont: adc x24,x24,x15 adds x24,x24,x16 - sub x14,x3,x5 // rewinded ap +#ifdef __CHERI_PURE_CAPABILITY__ + neg x14,x5 + add PTR(14),PTR(3),x14 // rewinded ap +#else + sub PTR(14),PTR(3),x5 // rewinded ap +#endif adc x25,xzr,xzr // t[14] add x25,x25,x17 cbz x27,.Lsqr8x_outer_break mov x4,x6 - ldp x6,x7,[x2,#8*0] - ldp x8,x9,[x2,#8*2] - ldp x10,x11,[x2,#8*4] - ldp x12,x13,[x2,#8*6] + ldp x6,x7,[PTR(2),#8*0] + ldp x8,x9,[PTR(2),#8*2] + ldp x10,x11,[PTR(2),#8*4] + ldp x12,x13,[PTR(2),#8*6] adds x19,x19,x6 adcs x20,x20,x7 - ldp x6,x7,[x1,#8*0] + ldp x6,x7,[PTR(1),#8*0] adcs x21,x21,x8 adcs x22,x22,x9 - ldp x8,x9,[x1,#8*2] + ldp x8,x9,[PTR(1),#8*2] adcs x23,x23,x10 adcs x24,x24,x11 - ldp x10,x11,[x1,#8*4] + ldp x10,x11,[PTR(1),#8*4] adcs x25,x25,x12 - mov x0,x1 + mov PTR(0),PTR(1) adcs x26,xzr,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 + ldp x12,x13,[PTR(1),#8*6] + add PTR(1),PTR(1),#8*8 //adc x28,xzr,xzr // moved below mov x27,#-8*8 @@ -1215,7 +1292,7 @@ __bn_sqr8x_mont: adcs x26,x26,x17 umulh x17,x9,x4 adc x28,x28,xzr - str x19,[x2],#8 + str x19,[PTR(2)],#8 adds x19,x20,x14 umulh x14,x10,x4 adcs x20,x21,x15 @@ -1224,7 +1301,7 @@ __bn_sqr8x_mont: umulh x16,x12,x4 adcs x22,x23,x17 umulh x17,x13,x4 - ldr x4,[x0,x27] + ldr x4,[PTR(0),x27] adcs x23,x24,x14 adcs x24,x25,x15 adcs x25,x26,x16 @@ -1233,70 +1310,75 @@ __bn_sqr8x_mont: cbnz x27,.Lsqr8x_mul // note that carry flag is guaranteed // to be zero at this point - cmp x1,x3 // done yet? + cmp PTR(1),PTR(3) // done yet? b.eq .Lsqr8x_break - ldp x6,x7,[x2,#8*0] - ldp x8,x9,[x2,#8*2] - ldp x10,x11,[x2,#8*4] - ldp x12,x13,[x2,#8*6] + ldp x6,x7,[PTR(2),#8*0] + ldp x8,x9,[PTR(2),#8*2] + ldp x10,x11,[PTR(2),#8*4] + ldp x12,x13,[PTR(2),#8*6] adds x19,x19,x6 - ldur x4,[x0,#-8*8] + ldur x4,[PTR(0),#-8*8] adcs x20,x20,x7 - ldp x6,x7,[x1,#8*0] + ldp x6,x7,[PTR(1),#8*0] adcs x21,x21,x8 adcs x22,x22,x9 - ldp x8,x9,[x1,#8*2] + ldp x8,x9,[PTR(1),#8*2] adcs x23,x23,x10 adcs x24,x24,x11 - ldp x10,x11,[x1,#8*4] + ldp x10,x11,[PTR(1),#8*4] adcs x25,x25,x12 mov x27,#-8*8 adcs x26,x26,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 + ldp x12,x13,[PTR(1),#8*6] + add PTR(1),PTR(1),#8*8 //adc x28,xzr,xzr // moved above b .Lsqr8x_mul .align 4 .Lsqr8x_break: - ldp x6,x7,[x0,#8*0] - add x1,x0,#8*8 - ldp x8,x9,[x0,#8*2] - sub x14,x3,x1 // is it last iteration? - ldp x10,x11,[x0,#8*4] - sub x15,x2,x14 - ldp x12,x13,[x0,#8*6] + ldp x6,x7,[PTR(0),#8*0] + add PTR(1),PTR(0),#8*8 + ldp x8,x9,[PTR(0),#8*2] + sub x14,x3,x1 // is it last iteration? 
+ ldp x10,x11,[PTR(0),#8*4] +#ifdef __CHERI_PURE_CAPABILITY__ + neg x14,x14 + add PTR(15),PTR(2),x14 +#else + sub PTR(15),PTR(2),x14 +#endif + ldp x12,x13,[PTR(0),#8*6] cbz x14,.Lsqr8x_outer_loop - stp x19,x20,[x2,#8*0] - ldp x19,x20,[x15,#8*0] - stp x21,x22,[x2,#8*2] - ldp x21,x22,[x15,#8*2] - stp x23,x24,[x2,#8*4] - ldp x23,x24,[x15,#8*4] - stp x25,x26,[x2,#8*6] - mov x2,x15 - ldp x25,x26,[x15,#8*6] + stp x19,x20,[PTR(2),#8*0] + ldp x19,x20,[PTR(15),#8*0] + stp x21,x22,[PTR(2),#8*2] + ldp x21,x22,[PTR(15),#8*2] + stp x23,x24,[PTR(2),#8*4] + ldp x23,x24,[PTR(15),#8*4] + stp x25,x26,[PTR(2),#8*6] + mov PTR(2),PTR(15) + ldp x25,x26,[PTR(15),#8*6] b .Lsqr8x_outer_loop .align 4 .Lsqr8x_outer_break: // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] - ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] - ldp x15,x16,[sp,#8*1] - ldp x11,x13,[x14,#8*2] - add x1,x14,#8*4 - ldp x17,x14,[sp,#8*3] + ldp x7,x9,[PTR(14),#8*0] // recall that x14 is &a[0] + ldp x15,x16,[PTRN(sp),#8*1] + ldp x11,x13,[PTR(14),#8*2] + add PTR(1),PTR(14),#8*4 + ldp x17,x14,[PTRN(sp),#8*3] - stp x19,x20,[x2,#8*0] + stp x19,x20,[PTR(2),#8*0] mul x19,x7,x7 - stp x21,x22,[x2,#8*2] + stp x21,x22,[PTR(2),#8*2] umulh x7,x7,x7 - stp x23,x24,[x2,#8*4] + stp x23,x24,[PTR(2),#8*4] mul x8,x9,x9 - stp x25,x26,[x2,#8*6] - mov x2,sp + stp x25,x26,[PTR(2),#8*6] + mov PTR(2),PTRN(sp) umulh x9,x9,x9 adds x20,x7,x15,lsl#1 extr x15,x16,x15,#63 @@ -1307,76 +1389,81 @@ __bn_sqr8x_mont: extr x16,x17,x16,#63 sub x27,x27,#8*4 adcs x22,x9,x16 - ldp x15,x16,[x2,#8*5] + ldp x15,x16,[PTR(2),#8*5] mul x10,x11,x11 - ldp x7,x9,[x1],#8*2 + ldp x7,x9,[PTR(1)],#8*2 umulh x11,x11,x11 mul x12,x13,x13 umulh x13,x13,x13 extr x17,x14,x17,#63 - stp x19,x20,[x2,#8*0] + stp x19,x20,[PTR(2),#8*0] adcs x23,x10,x17 extr x14,x15,x14,#63 - stp x21,x22,[x2,#8*2] + stp x21,x22,[PTR(2),#8*2] adcs x24,x11,x14 - ldp x17,x14,[x2,#8*7] + ldp x17,x14,[PTR(2),#8*7] extr x15,x16,x15,#63 adcs x25,x12,x15 extr x16,x17,x16,#63 adcs x26,x13,x16 - ldp x15,x16,[x2,#8*9] + ldp x15,x16,[PTR(2),#8*9] mul x6,x7,x7 - ldp x11,x13,[x1],#8*2 + ldp x11,x13,[PTR(1)],#8*2 umulh x7,x7,x7 mul x8,x9,x9 umulh x9,x9,x9 - stp x23,x24,[x2,#8*4] + stp x23,x24,[PTR(2),#8*4] extr x17,x14,x17,#63 - stp x25,x26,[x2,#8*6] - add x2,x2,#8*8 + stp x25,x26,[PTR(2),#8*6] + add PTR(2),PTR(2),#8*8 adcs x19,x6,x17 extr x14,x15,x14,#63 adcs x20,x7,x14 - ldp x17,x14,[x2,#8*3] + ldp x17,x14,[PTR(2),#8*3] extr x15,x16,x15,#63 cbnz x27,.Lsqr4x_shift_n_add - ldp x1,x4,[x29,#104] // pull np and n0 +#ifdef __CHERI_PURE_CAPABILITY__ + ldr PTR(1),[PTR(29),#(13*PTR_WIDTH)] // pull np + ldr x4,[PTR(29),#(14*PTR_WIDTH)] // pull n0 +#else + ldp PTR(1),x4,[PTR(29),#(13*PTR_WIDTH)] // pull np and n0 +#endif adcs x21,x8,x15 extr x16,x17,x16,#63 adcs x22,x9,x16 - ldp x15,x16,[x2,#8*5] + ldp x15,x16,[PTR(2),#8*5] mul x10,x11,x11 umulh x11,x11,x11 - stp x19,x20,[x2,#8*0] + stp x19,x20,[PTR(2),#8*0] mul x12,x13,x13 umulh x13,x13,x13 - stp x21,x22,[x2,#8*2] + stp x21,x22,[PTR(2),#8*2] extr x17,x14,x17,#63 adcs x23,x10,x17 extr x14,x15,x14,#63 - ldp x19,x20,[sp,#8*0] + ldp x19,x20,[PTRN(sp),#8*0] adcs x24,x11,x14 extr x15,x16,x15,#63 - ldp x6,x7,[x1,#8*0] + ldp x6,x7,[PTR(1),#8*0] adcs x25,x12,x15 extr x16,xzr,x16,#63 - ldp x8,x9,[x1,#8*2] + ldp x8,x9,[PTR(1),#8*2] adc x26,x13,x16 - ldp x10,x11,[x1,#8*4] + ldp x10,x11,[PTR(1),#8*4] // Reduce by 512 bits per iteration mul x28,x4,x19 // t[0]*n0 - ldp x12,x13,[x1,#8*6] - add x3,x1,x5 - ldp x21,x22,[sp,#8*2] - stp x23,x24,[x2,#8*4] - ldp x23,x24,[sp,#8*4] - stp x25,x26,[x2,#8*6] 
- ldp x25,x26,[sp,#8*6] - add x1,x1,#8*8 + ldp x12,x13,[PTR(1),#8*6] + add PTR(3),PTR(1),x5 + ldp x21,x22,[PTRN(sp),#8*2] + stp x23,x24,[PTR(2),#8*4] + ldp x23,x24,[PTRN(sp),#8*4] + stp x25,x26,[PTR(2),#8*6] + ldp x25,x26,[PTRN(sp),#8*6] + add PTR(1),PTR(1),#8*8 mov x30,xzr // initial top-most carry - mov x2,sp + mov PTR(2),PTRN(sp) mov x27,#8 .Lsqr8x_reduction: @@ -1384,7 +1471,7 @@ __bn_sqr8x_mont: mul x15,x7,x28 sub x27,x27,#1 mul x16,x8,x28 - str x28,[x2],#8 // put aside t[0]*n0 for tail processing + str x28,[PTR(2)],#8 // put aside t[0]*n0 for tail processing mul x17,x9,x28 // (*) adds xzr,x19,x14 subs xzr,x19,#1 // (*) @@ -1419,16 +1506,16 @@ __bn_sqr8x_mont: adc x26,x26,x17 cbnz x27,.Lsqr8x_reduction - ldp x14,x15,[x2,#8*0] - ldp x16,x17,[x2,#8*2] - mov x0,x2 + ldp x14,x15,[PTR(2),#8*0] + ldp x16,x17,[PTR(2),#8*2] + mov PTR(0),PTR(2) sub x27,x3,x1 // done yet? adds x19,x19,x14 adcs x20,x20,x15 - ldp x14,x15,[x2,#8*4] + ldp x14,x15,[PTR(2),#8*4] adcs x21,x21,x16 adcs x22,x22,x17 - ldp x16,x17,[x2,#8*6] + ldp x16,x17,[PTR(2),#8*6] adcs x23,x23,x14 adcs x24,x24,x15 adcs x25,x25,x16 @@ -1436,13 +1523,13 @@ __bn_sqr8x_mont: //adc x28,xzr,xzr // moved below cbz x27,.Lsqr8x8_post_condition - ldur x4,[x2,#-8*8] - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - ldp x10,x11,[x1,#8*4] + ldur x4,[PTR(2),#-8*8] + ldp x6,x7,[PTR(1),#8*0] + ldp x8,x9,[PTR(1),#8*2] + ldp x10,x11,[PTR(1),#8*4] mov x27,#-8*8 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 + ldp x12,x13,[PTR(1),#8*6] + add PTR(1),PTR(1),#8*8 .Lsqr8x_tail: mul x14,x6,x4 @@ -1468,7 +1555,7 @@ __bn_sqr8x_mont: adcs x26,x26,x17 umulh x17,x9,x4 adc x28,x28,xzr - str x19,[x2],#8 + str x19,[PTR(2)],#8 adds x19,x20,x14 umulh x14,x10,x4 adcs x20,x21,x15 @@ -1477,7 +1564,7 @@ __bn_sqr8x_mont: umulh x16,x12,x4 adcs x22,x23,x17 umulh x17,x13,x4 - ldr x4,[x0,x27] + ldr x4,[PTR(0),x27] adcs x23,x24,x14 adcs x24,x25,x15 adcs x25,x26,x16 @@ -1486,63 +1573,68 @@ __bn_sqr8x_mont: cbnz x27,.Lsqr8x_tail // note that carry flag is guaranteed // to be zero at this point - ldp x6,x7,[x2,#8*0] + ldp x6,x7,[PTR(2),#8*0] sub x27,x3,x1 // done yet? 
- sub x16,x3,x5 // rewinded np - ldp x8,x9,[x2,#8*2] - ldp x10,x11,[x2,#8*4] - ldp x12,x13,[x2,#8*6] +#ifdef __CHERI_PURE_CAPABILITY__ + neg x16,x5 + add PTR(16),PTR(3),x16 // rewinded np +#else + sub PTR(16),PTR(3),x5 // rewinded np +#endif + ldp x8,x9,[PTR(2),#8*2] + ldp x10,x11,[PTR(2),#8*4] + ldp x12,x13,[PTR(2),#8*6] cbz x27,.Lsqr8x_tail_break - ldur x4,[x0,#-8*8] + ldur x4,[PTR(0),#-8*8] adds x19,x19,x6 adcs x20,x20,x7 - ldp x6,x7,[x1,#8*0] + ldp x6,x7,[PTR(1),#8*0] adcs x21,x21,x8 adcs x22,x22,x9 - ldp x8,x9,[x1,#8*2] + ldp x8,x9,[PTR(1),#8*2] adcs x23,x23,x10 adcs x24,x24,x11 - ldp x10,x11,[x1,#8*4] + ldp x10,x11,[PTR(1),#8*4] adcs x25,x25,x12 mov x27,#-8*8 adcs x26,x26,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 + ldp x12,x13,[PTR(1),#8*6] + add PTR(1),PTR(1),#8*8 //adc x28,xzr,xzr // moved above b .Lsqr8x_tail .align 4 .Lsqr8x_tail_break: - ldr x4,[x29,#112] // pull n0 + ldr x4,[PTR(29),#(14*PTR_WIDTH)] // pull n0 add x27,x2,#8*8 // end of current t[num] window subs xzr,x30,#1 // "move" top-most carry to carry bit adcs x14,x19,x6 adcs x15,x20,x7 - ldp x19,x20,[x0,#8*0] + ldp x19,x20,[PTR(0),#8*0] adcs x21,x21,x8 - ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + ldp x6,x7,[PTR(16),#8*0] // recall that x16 is &n[0] adcs x22,x22,x9 - ldp x8,x9,[x16,#8*2] + ldp x8,x9,[PTR(16),#8*2] adcs x23,x23,x10 adcs x24,x24,x11 - ldp x10,x11,[x16,#8*4] + ldp x10,x11,[PTR(16),#8*4] adcs x25,x25,x12 adcs x26,x26,x13 - ldp x12,x13,[x16,#8*6] - add x1,x16,#8*8 + ldp x12,x13,[PTR(16),#8*6] + add PTR(1),PTR(16),#8*8 adc x30,xzr,xzr // top-most carry mul x28,x4,x19 - stp x14,x15,[x2,#8*0] - stp x21,x22,[x2,#8*2] - ldp x21,x22,[x0,#8*2] - stp x23,x24,[x2,#8*4] - ldp x23,x24,[x0,#8*4] + stp x14,x15,[PTR(2),#8*0] + stp x21,x22,[PTR(2),#8*2] + ldp x21,x22,[PTR(0),#8*2] + stp x23,x24,[PTR(2),#8*4] + ldp x23,x24,[PTR(0),#8*4] cmp x27,x29 // did we hit the bottom? - stp x25,x26,[x2,#8*6] - mov x2,x0 // slide the window - ldp x25,x26,[x0,#8*6] + stp x25,x26,[PTR(2),#8*6] + mov PTR(2),PTR(0) // slide the window + ldp x25,x26,[PTR(0),#8*6] mov x27,#8 b.ne .Lsqr8x_reduction @@ -1550,139 +1642,139 @@ __bn_sqr8x_mont: // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. 
- ldr x0,[x29,#96] // pull rp - add x2,x2,#8*8 + ldr PTR(0),[PTR(29),#(12*PTR_WIDTH)] // pull rp + add PTR(2),PTR(2),#8*8 subs x14,x19,x6 sbcs x15,x20,x7 sub x27,x5,#8*8 - mov x3,x0 // x0 copy + mov PTR(3),PTR(0) // PTR(0) copy .Lsqr8x_sub: sbcs x16,x21,x8 - ldp x6,x7,[x1,#8*0] + ldp x6,x7,[PTR(1),#8*0] sbcs x17,x22,x9 - stp x14,x15,[x0,#8*0] + stp x14,x15,[PTR(0),#8*0] sbcs x14,x23,x10 - ldp x8,x9,[x1,#8*2] + ldp x8,x9,[PTR(1),#8*2] sbcs x15,x24,x11 - stp x16,x17,[x0,#8*2] + stp x16,x17,[PTR(0),#8*2] sbcs x16,x25,x12 - ldp x10,x11,[x1,#8*4] + ldp x10,x11,[PTR(1),#8*4] sbcs x17,x26,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - ldp x19,x20,[x2,#8*0] + ldp x12,x13,[PTR(1),#8*6] + add PTR(1),PTR(1),#8*8 + ldp x19,x20,[PTR(2),#8*0] sub x27,x27,#8*8 - ldp x21,x22,[x2,#8*2] - ldp x23,x24,[x2,#8*4] - ldp x25,x26,[x2,#8*6] - add x2,x2,#8*8 - stp x14,x15,[x0,#8*4] + ldp x21,x22,[PTR(2),#8*2] + ldp x23,x24,[PTR(2),#8*4] + ldp x25,x26,[PTR(2),#8*6] + add PTR(2),PTR(2),#8*8 + stp x14,x15,[PTR(0),#8*4] sbcs x14,x19,x6 - stp x16,x17,[x0,#8*6] - add x0,x0,#8*8 + stp x16,x17,[PTR(0),#8*6] + add PTR(0),PTR(0),#8*8 sbcs x15,x20,x7 cbnz x27,.Lsqr8x_sub sbcs x16,x21,x8 - mov x2,sp - add x1,sp,x5 - ldp x6,x7,[x3,#8*0] + mov PTR(2),PTRN(sp) + add PTR(1),PTRN(sp),x5 + ldp x6,x7,[PTR(3),#8*0] sbcs x17,x22,x9 - stp x14,x15,[x0,#8*0] + stp x14,x15,[PTR(0),#8*0] sbcs x14,x23,x10 - ldp x8,x9,[x3,#8*2] + ldp x8,x9,[PTR(3),#8*2] sbcs x15,x24,x11 - stp x16,x17,[x0,#8*2] + stp x16,x17,[PTR(0),#8*2] sbcs x16,x25,x12 - ldp x19,x20,[x1,#8*0] + ldp x19,x20,[PTR(1),#8*0] sbcs x17,x26,x13 - ldp x21,x22,[x1,#8*2] + ldp x21,x22,[PTR(1),#8*2] sbcs xzr,x30,xzr // did it borrow? - ldr x30,[x29,#8] // pull return address - stp x14,x15,[x0,#8*4] - stp x16,x17,[x0,#8*6] + ldr PTR(30),[PTR(29),#PTR_WIDTH] // pull return address + stp x14,x15,[PTR(0),#8*4] + stp x16,x17,[PTR(0),#8*6] sub x27,x5,#8*4 .Lsqr4x_cond_copy: sub x27,x27,#8*4 csel x14,x19,x6,lo - stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[PTR(2),#8*0] csel x15,x20,x7,lo - ldp x6,x7,[x3,#8*4] - ldp x19,x20,[x1,#8*4] + ldp x6,x7,[PTR(3),#8*4] + ldp x19,x20,[PTR(1),#8*4] csel x16,x21,x8,lo - stp xzr,xzr,[x2,#8*2] - add x2,x2,#8*4 + stp xzr,xzr,[PTR(2),#8*2] + add PTR(2),PTR(2),#8*4 csel x17,x22,x9,lo - ldp x8,x9,[x3,#8*6] - ldp x21,x22,[x1,#8*6] - add x1,x1,#8*4 - stp x14,x15,[x3,#8*0] - stp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - stp xzr,xzr,[x1,#8*0] - stp xzr,xzr,[x1,#8*2] + ldp x8,x9,[PTR(3),#8*6] + ldp x21,x22,[PTR(1),#8*6] + add PTR(1),PTR(1),#8*4 + stp x14,x15,[PTR(3),#8*0] + stp x16,x17,[PTR(3),#8*2] + add PTR(3),PTR(3),#8*4 + stp xzr,xzr,[PTR(1),#8*0] + stp xzr,xzr,[PTR(1),#8*2] cbnz x27,.Lsqr4x_cond_copy csel x14,x19,x6,lo - stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[PTR(2),#8*0] csel x15,x20,x7,lo - stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[PTR(2),#8*2] csel x16,x21,x8,lo csel x17,x22,x9,lo - stp x14,x15,[x3,#8*0] - stp x16,x17,[x3,#8*2] + stp x14,x15,[PTR(3),#8*0] + stp x16,x17,[PTR(3),#8*2] b .Lsqr8x_done .align 4 .Lsqr8x8_post_condition: adc x28,xzr,xzr - ldr x30,[x29,#8] // pull return address + ldr PTR(30),[PTR(29),#PTR_WIDTH] // pull return address // x19-7,x28 hold result, x6-7 hold modulus subs x6,x19,x6 - ldr x1,[x29,#96] // pull rp + ldr PTR(1),[PTR(29),#(12*PTR_WIDTH)] // pull rp sbcs x7,x20,x7 - stp xzr,xzr,[sp,#8*0] + stp xzr,xzr,[PTRN(sp),#8*0] sbcs x8,x21,x8 - stp xzr,xzr,[sp,#8*2] + stp xzr,xzr,[PTRN(sp),#8*2] sbcs x9,x22,x9 - stp xzr,xzr,[sp,#8*4] + stp xzr,xzr,[PTRN(sp),#8*4] sbcs x10,x23,x10 - stp xzr,xzr,[sp,#8*6] + stp xzr,xzr,[PTRN(sp),#8*6] sbcs x11,x24,x11 - stp 
xzr,xzr,[sp,#8*8] + stp xzr,xzr,[PTRN(sp),#8*8] sbcs x12,x25,x12 - stp xzr,xzr,[sp,#8*10] + stp xzr,xzr,[PTRN(sp),#8*10] sbcs x13,x26,x13 - stp xzr,xzr,[sp,#8*12] + stp xzr,xzr,[PTRN(sp),#8*12] sbcs x28,x28,xzr // did it borrow? - stp xzr,xzr,[sp,#8*14] + stp xzr,xzr,[PTRN(sp),#8*14] // x6-7 hold result-modulus csel x6,x19,x6,lo csel x7,x20,x7,lo csel x8,x21,x8,lo csel x9,x22,x9,lo - stp x6,x7,[x1,#8*0] + stp x6,x7,[PTR(1),#8*0] csel x10,x23,x10,lo csel x11,x24,x11,lo - stp x8,x9,[x1,#8*2] + stp x8,x9,[PTR(1),#8*2] csel x12,x25,x12,lo csel x13,x26,x13,lo - stp x10,x11,[x1,#8*4] - stp x12,x13,[x1,#8*6] + stp x10,x11,[PTR(1),#8*4] + stp x12,x13,[PTR(1),#8*6] .Lsqr8x_done: - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + mov PTRN(sp),PTR(29) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] mov x0,#1 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(16*PTR_WIDTH) // x30 is loaded earlier AARCH64_VALIDATE_LINK_REGISTER ret @@ -1692,37 +1784,47 @@ __bn_sqr8x_mont: __bn_mul4x_mont: // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address. - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - sub x26,sp,x5,lsl#3 + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + +#ifdef __CHERI_PURE_CAPABILITY__ + neg x10,x5 + add PTR(26),csp,x10,lsl#3 +#else + sub PTR(26),sp,x5,lsl#3 +#endif lsl x5,x5,#3 - ldr x4,[x4] // *n0 - sub sp,x26,#8*4 // alloca + ldr x4,[PTR(4)] // *n0 + sub PTRN(sp),PTR(26),#8*4 // alloca - add x10,x2,x5 - add x27,x1,x5 - stp x0,x10,[x29,#96] // offload rp and &b[num] + add PTR(10),PTR(2),x5 + add PTR(27),PTR(1),x5 + stp PTR(0),PTR(10),[PTR(29),#(12*PTR_WIDTH)] // offload rp and &b[num] - ldr x24,[x2,#8*0] // b[0] - ldp x6,x7,[x1,#8*0] // a[0..3] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 + ldr x24,[PTR(2),#8*0] // b[0] + ldp x6,x7,[PTR(1),#8*0] // a[0..3] + ldp x8,x9,[PTR(1),#8*2] + add PTR(1),PTR(1),#8*4 mov x19,xzr mov x20,xzr mov x21,xzr mov x22,xzr - ldp x14,x15,[x3,#8*0] // n[0..3] - ldp x16,x17,[x3,#8*2] - adds x3,x3,#8*4 // clear carry bit + ldp x14,x15,[PTR(3),#8*0] // n[0..3] + ldp x16,x17,[PTR(3),#8*2] +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(3),PTR(3),#8*4 + cmn x0,xzr // clear carry bit +#else + adds PTR(3),PTR(3),#8*4 // clear carry bit +#endif mov x0,xzr mov x28,#0 - mov x26,sp + mov PTR(26),PTRN(sp) .Loop_mul4x_1st_reduction: mul x10,x6,x24 // lo(a[0..3]*b[0]) @@ -1742,10 +1844,10 @@ __bn_mul4x_mont: umulh x12,x8,x24 adc x23,xzr,xzr umulh x13,x9,x24 - ldr x24,[x2,x28] // next b[i] (or b[0]) + ldr x24,[PTR(2),x28] // next b[i] (or b[0]) adds x20,x20,x10 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) - str x25,[x26],#8 // put aside t[0]*n0 for tail processing + str x25,[PTR(26)],#8 // put aside t[0]*n0 for tail processing adcs x21,x21,x11 mul x11,x15,x25 adcs x22,x22,x12 @@ -1773,13 +1875,13 @@ __bn_mul4x_mont: 
cbz x10,.Lmul4x4_post_condition - ldp x6,x7,[x1,#8*0] // a[4..7] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - ldr x25,[sp] // a[0]*n0 - ldp x14,x15,[x3,#8*0] // n[4..7] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 + ldp x6,x7,[PTR(1),#8*0] // a[4..7] + ldp x8,x9,[PTR(1),#8*2] + add PTR(1),PTR(1),#8*4 + ldr x25,[PTRN(sp)] // a[0]*n0 + ldp x14,x15,[PTR(3),#8*0] // n[4..7] + ldp x16,x17,[PTR(3),#8*2] + add PTR(3),PTR(3),#8*4 .Loop_mul4x_1st_tail: mul x10,x6,x24 // lo(a[4..7]*b[i]) @@ -1798,7 +1900,7 @@ __bn_mul4x_mont: adcs x22,x22,x13 umulh x13,x9,x24 adc x23,xzr,xzr - ldr x24,[x2,x28] // next b[i] (or b[0]) + ldr x24,[PTR(2),x28] // next b[i] (or b[0]) adds x20,x20,x10 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) adcs x21,x21,x11 @@ -1817,45 +1919,60 @@ __bn_mul4x_mont: adcs x23,x23,x0 umulh x13,x17,x25 adc x0,xzr,xzr - ldr x25,[sp,x28] // next t[0]*n0 - str x19,[x26],#8 // result!!! + ldr x25,[PTRN(sp),x28] // next t[0]*n0 + str x19,[PTR(26)],#8 // result!!! adds x19,x20,x10 - sub x10,x27,x1 // done yet? + sub x10,x27,x1 // done yet? adcs x20,x21,x11 adcs x21,x22,x12 adcs x22,x23,x13 //adc x0,x0,xzr cbnz x28,.Loop_mul4x_1st_tail - sub x11,x27,x5 // rewinded x1 +#ifdef __CHERI_PURE_CAPABILITY__ + neg x11,x5 + add PTR(11),PTR(27),x11 // rewinded PTR(1) +#else + sub PTR(11),PTR(27),x5 // rewinded PTR(1) +#endif cbz x10,.Lmul4x_proceed - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - ldp x14,x15,[x3,#8*0] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 + ldp x6,x7,[PTR(1),#8*0] + ldp x8,x9,[PTR(1),#8*2] + add PTR(1),PTR(1),#8*4 + ldp x14,x15,[PTR(3),#8*0] + ldp x16,x17,[PTR(3),#8*2] + add PTR(3),PTR(3),#8*4 b .Loop_mul4x_1st_tail .align 5 .Lmul4x_proceed: - ldr x24,[x2,#8*4]! // *++b + ldr x24,[PTR(2),#8*4]! // *++b adc x30,x0,xzr - ldp x6,x7,[x11,#8*0] // a[0..3] - sub x3,x3,x5 // rewind np - ldp x8,x9,[x11,#8*2] - add x1,x11,#8*4 - - stp x19,x20,[x26,#8*0] // result!!! - ldp x19,x20,[sp,#8*4] // t[0..3] - stp x21,x22,[x26,#8*2] // result!!! - ldp x21,x22,[sp,#8*6] - - ldp x14,x15,[x3,#8*0] // n[0..3] - mov x26,sp - ldp x16,x17,[x3,#8*2] - adds x3,x3,#8*4 // clear carry bit + ldp x6,x7,[PTR(11),#8*0] // a[0..3] +#ifdef __CHERI_PURE_CAPABILITY__ + neg x10,x5 + add PTR(3),PTR(3),x10 // rewind np +#else + sub PTR(3),PTR(3),x5 // rewind np +#endif + ldp x8,x9,[PTR(11),#8*2] + add PTR(1),PTR(11),#8*4 + + stp x19,x20,[PTR(26),#8*0] // result!!! + ldp x19,x20,[PTRN(sp),#8*4] // t[0..3] + stp x21,x22,[PTR(26),#8*2] // result!!! 
+ ldp x21,x22,[PTRN(sp),#8*6] + + ldp x14,x15,[PTR(3),#8*0] // n[0..3] + mov PTR(26),PTRN(sp) + ldp x16,x17,[PTR(3),#8*2] +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(3),PTR(3),#8*4 + cmn x0,xzr // clear carry bit +#else + adds PTR(3),PTR(3),#8*4 // clear carry bit +#endif mov x0,xzr .align 4 @@ -1877,10 +1994,10 @@ __bn_mul4x_mont: umulh x12,x8,x24 adc x23,xzr,xzr umulh x13,x9,x24 - ldr x24,[x2,x28] // next b[i] + ldr x24,[PTR(2),x28] // next b[i] adds x20,x20,x10 // (*) mul x10,x14,x25 - str x25,[x26],#8 // put aside t[0]*n0 for tail processing + str x25,[PTR(26)],#8 // put aside t[0]*n0 for tail processing adcs x21,x21,x11 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 adcs x22,x22,x12 @@ -1906,21 +2023,21 @@ __bn_mul4x_mont: cbnz x28,.Loop_mul4x_reduction adc x0,x0,xzr - ldp x10,x11,[x26,#8*4] // t[4..7] - ldp x12,x13,[x26,#8*6] - ldp x6,x7,[x1,#8*0] // a[4..7] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 + ldp x10,x11,[PTR(26),#8*4] // t[4..7] + ldp x12,x13,[PTR(26),#8*6] + ldp x6,x7,[PTR(1),#8*0] // a[4..7] + ldp x8,x9,[PTR(1),#8*2] + add PTR(1),PTR(1),#8*4 adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr - ldr x25,[sp] // t[0]*n0 - ldp x14,x15,[x3,#8*0] // n[4..7] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 + ldr x25,[PTRN(sp)] // t[0]*n0 + ldp x14,x15,[PTR(3),#8*0] // n[4..7] + ldp x16,x17,[PTR(3),#8*2] + add PTR(3),PTR(3),#8*4 .align 4 .Loop_mul4x_tail: @@ -1940,7 +2057,7 @@ __bn_mul4x_mont: adcs x22,x22,x13 umulh x13,x9,x24 adc x23,xzr,xzr - ldr x24,[x2,x28] // next b[i] + ldr x24,[PTR(2),x28] // next b[i] adds x20,x20,x10 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) adcs x21,x21,x11 @@ -1958,62 +2075,77 @@ __bn_mul4x_mont: adcs x22,x22,x13 umulh x13,x17,x25 adcs x23,x23,x0 - ldr x25,[sp,x28] // next a[0]*n0 + ldr x25,[PTRN(sp),x28] // next a[0]*n0 adc x0,xzr,xzr - str x19,[x26],#8 // result!!! + str x19,[PTR(26)],#8 // result!!! adds x19,x20,x10 - sub x10,x27,x1 // done yet? + sub x10,x27,x1 // done yet? adcs x20,x21,x11 adcs x21,x22,x12 adcs x22,x23,x13 //adc x0,x0,xzr cbnz x28,.Loop_mul4x_tail - sub x11,x3,x5 // rewinded np? +#ifdef __CHERI_PURE_CAPABILITY__ + neg x11,x5 + add PTR(11),PTR(3),x11 // rewinded np? +#else + sub PTR(11),PTR(3),x5 // rewinded np? +#endif adc x0,x0,xzr cbz x10,.Loop_mul4x_break - ldp x10,x11,[x26,#8*4] - ldp x12,x13,[x26,#8*6] - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 + ldp x10,x11,[PTR(26),#8*4] + ldp x12,x13,[PTR(26),#8*6] + ldp x6,x7,[PTR(1),#8*0] + ldp x8,x9,[PTR(1),#8*2] + add PTR(1),PTR(1),#8*4 adds x19,x19,x10 adcs x20,x20,x11 adcs x21,x21,x12 adcs x22,x22,x13 //adc x0,x0,xzr - ldp x14,x15,[x3,#8*0] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 + ldp x14,x15,[PTR(3),#8*0] + ldp x16,x17,[PTR(3),#8*2] + add PTR(3),PTR(3),#8*4 b .Loop_mul4x_tail .align 4 .Loop_mul4x_break: - ldp x12,x13,[x29,#96] // pull rp and &b[num] + ldp PTR(12),PTR(13),[PTR(29),#(12*PTR_WIDTH)] // pull rp and &b[num] adds x19,x19,x30 - add x2,x2,#8*4 // bp++ + add PTR(2),PTR(2),#8*4 // bp++ adcs x20,x20,xzr - sub x1,x1,x5 // rewind ap +#ifdef __CHERI_PURE_CAPABILITY__ + neg x10,x5 + add PTR(1),PTR(1),x10 // rewind ap +#else + sub PTR(1),PTR(1),x5 // rewind ap +#endif adcs x21,x21,xzr - stp x19,x20,[x26,#8*0] // result!!! + stp x19,x20,[PTR(26),#8*0] // result!!! adcs x22,x22,xzr - ldp x19,x20,[sp,#8*4] // t[0..3] + ldp x19,x20,[PTRN(sp),#8*4] // t[0..3] adc x30,x0,xzr - stp x21,x22,[x26,#8*2] // result!!! - cmp x2,x13 // done yet? 
- ldp x21,x22,[sp,#8*6] - ldp x14,x15,[x11,#8*0] // n[0..3] - ldp x16,x17,[x11,#8*2] - add x3,x11,#8*4 + stp x21,x22,[PTR(26),#8*2] // result!!! + cmp PTR(2),PTR(13) // done yet? + ldp x21,x22,[PTRN(sp),#8*6] + ldp x14,x15,[PTR(11),#8*0] // n[0..3] + ldp x16,x17,[PTR(11),#8*2] + add PTR(3),PTR(11),#8*4 b.eq .Lmul4x_post - ldr x24,[x2] - ldp x6,x7,[x1,#8*0] // a[0..3] - ldp x8,x9,[x1,#8*2] - adds x1,x1,#8*4 // clear carry bit + ldr x24,[PTR(2)] + ldp x6,x7,[PTR(1),#8*0] // a[0..3] + ldp x8,x9,[PTR(1),#8*2] +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(1),PTR(1),#8*4 + cmn x0,xzr // clear carry bit +#else + adds PTR(1),PTR(1),#8*4 // clear carry bit +#endif mov x0,xzr - mov x26,sp + mov PTR(26),PTRN(sp) b .Loop_mul4x_reduction .align 4 @@ -2022,109 +2154,109 @@ __bn_mul4x_mont: // if it is, subtract the modulus. But comparison implies // subtraction. So we subtract modulus, see if it borrowed, // and conditionally copy original value. - mov x0,x12 - mov x27,x12 // x0 copy + mov PTR(0),PTR(12) + mov PTR(27),PTR(12) // PTR(0) copy subs x10,x19,x14 - add x26,sp,#8*8 + add PTR(26),PTRN(sp),#8*8 sbcs x11,x20,x15 sub x28,x5,#8*4 .Lmul4x_sub: sbcs x12,x21,x16 - ldp x14,x15,[x3,#8*0] + ldp x14,x15,[PTR(3),#8*0] sub x28,x28,#8*4 - ldp x19,x20,[x26,#8*0] + ldp x19,x20,[PTR(26),#8*0] sbcs x13,x22,x17 - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - ldp x21,x22,[x26,#8*2] - add x26,x26,#8*4 - stp x10,x11,[x0,#8*0] + ldp x16,x17,[PTR(3),#8*2] + add PTR(3),PTR(3),#8*4 + ldp x21,x22,[PTR(26),#8*2] + add PTR(26),PTR(26),#8*4 + stp x10,x11,[PTR(0),#8*0] sbcs x10,x19,x14 - stp x12,x13,[x0,#8*2] - add x0,x0,#8*4 + stp x12,x13,[PTR(0),#8*2] + add PTR(0),PTR(0),#8*4 sbcs x11,x20,x15 cbnz x28,.Lmul4x_sub sbcs x12,x21,x16 - mov x26,sp - add x1,sp,#8*4 - ldp x6,x7,[x27,#8*0] + mov PTR(26),PTRN(sp) + add PTR(1),PTRN(sp),#8*4 + ldp x6,x7,[PTR(27),#8*0] sbcs x13,x22,x17 - stp x10,x11,[x0,#8*0] - ldp x8,x9,[x27,#8*2] - stp x12,x13,[x0,#8*2] - ldp x19,x20,[x1,#8*0] - ldp x21,x22,[x1,#8*2] + stp x10,x11,[PTR(0),#8*0] + ldp x8,x9,[PTR(27),#8*2] + stp x12,x13,[PTR(0),#8*2] + ldp x19,x20,[PTR(1),#8*0] + ldp x21,x22,[PTR(1),#8*2] sbcs xzr,x30,xzr // did it borrow? 
- ldr x30,[x29,#8] // pull return address + ldr PTR(30),[PTR(29),#PTR_WIDTH] // pull return address sub x28,x5,#8*4 .Lmul4x_cond_copy: sub x28,x28,#8*4 csel x10,x19,x6,lo - stp xzr,xzr,[x26,#8*0] + stp xzr,xzr,[PTR(26),#8*0] csel x11,x20,x7,lo - ldp x6,x7,[x27,#8*4] - ldp x19,x20,[x1,#8*4] + ldp x6,x7,[PTR(27),#8*4] + ldp x19,x20,[PTR(1),#8*4] csel x12,x21,x8,lo - stp xzr,xzr,[x26,#8*2] - add x26,x26,#8*4 + stp xzr,xzr,[PTR(26),#8*2] + add PTR(26),PTR(26),#8*4 csel x13,x22,x9,lo - ldp x8,x9,[x27,#8*6] - ldp x21,x22,[x1,#8*6] - add x1,x1,#8*4 - stp x10,x11,[x27,#8*0] - stp x12,x13,[x27,#8*2] - add x27,x27,#8*4 + ldp x8,x9,[PTR(27),#8*6] + ldp x21,x22,[PTR(1),#8*6] + add PTR(1),PTR(1),#8*4 + stp x10,x11,[PTR(27),#8*0] + stp x12,x13,[PTR(27),#8*2] + add PTR(27),PTR(27),#8*4 cbnz x28,.Lmul4x_cond_copy csel x10,x19,x6,lo - stp xzr,xzr,[x26,#8*0] + stp xzr,xzr,[PTR(26),#8*0] csel x11,x20,x7,lo - stp xzr,xzr,[x26,#8*2] + stp xzr,xzr,[PTR(26),#8*2] csel x12,x21,x8,lo - stp xzr,xzr,[x26,#8*3] + stp xzr,xzr,[PTR(26),#8*3] csel x13,x22,x9,lo - stp xzr,xzr,[x26,#8*4] - stp x10,x11,[x27,#8*0] - stp x12,x13,[x27,#8*2] + stp xzr,xzr,[PTR(26),#8*4] + stp x10,x11,[PTR(27),#8*0] + stp x12,x13,[PTR(27),#8*2] b .Lmul4x_done .align 4 .Lmul4x4_post_condition: adc x0,x0,xzr - ldr x1,[x29,#96] // pull rp + ldr PTR(1),[PTR(29),#(12*PTR_WIDTH)] // pull rp // x19-3,x0 hold result, x14-7 hold modulus subs x6,x19,x14 - ldr x30,[x29,#8] // pull return address + ldr PTR(30),[PTR(29),#PTR_WIDTH] // pull return address sbcs x7,x20,x15 - stp xzr,xzr,[sp,#8*0] + stp xzr,xzr,[PTRN(sp),#8*0] sbcs x8,x21,x16 - stp xzr,xzr,[sp,#8*2] + stp xzr,xzr,[PTRN(sp),#8*2] sbcs x9,x22,x17 - stp xzr,xzr,[sp,#8*4] + stp xzr,xzr,[PTRN(sp),#8*4] sbcs xzr,x0,xzr // did it borrow? - stp xzr,xzr,[sp,#8*6] + stp xzr,xzr,[PTRN(sp),#8*6] // x6-3 hold result-modulus csel x6,x19,x6,lo csel x7,x20,x7,lo csel x8,x21,x8,lo csel x9,x22,x9,lo - stp x6,x7,[x1,#8*0] - stp x8,x9,[x1,#8*2] + stp x6,x7,[PTR(1),#8*0] + stp x8,x9,[PTR(1),#8*2] .Lmul4x_done: - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + mov PTRN(sp),PTR(29) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] mov x0,#1 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(16*PTR_WIDTH) // x30 loaded earlier AARCH64_VALIDATE_LINK_REGISTER ret diff --git a/sys/crypto/openssl/aarch64/chacha-armv8.S b/sys/crypto/openssl/aarch64/chacha-armv8.S index 4f9d6bd372f7..2e0cc3f89160 100644 --- a/sys/crypto/openssl/aarch64/chacha-armv8.S +++ b/sys/crypto/openssl/aarch64/chacha-armv8.S @@ -27,28 +27,28 @@ ChaCha20_ctr32: b.lo .Lshort #ifndef __KERNEL__ - adrp x17,OPENSSL_armcap_P - ldr w17,[x17,#:lo12:OPENSSL_armcap_P] + adrp PTR(17),OPENSSL_armcap_P + ldr w17,[PTR(17),#:lo12:OPENSSL_armcap_P] tst w17,#ARMV7_NEON b.ne .LChaCha20_neon #endif .Lshort: - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ldp x28,x30,[x4] // load counter + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + + adr PTR(5),.Lsigma + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#64 + + ldp x22,x23,[PTR(5)] // load sigma + ldp x24,x25,[PTR(3)] // load key + ldp x26,x27,[PTR(3),#16] + ldp x28,x30,[PTR(4)] // load counter #ifdef __AARCH64EB__ ror x24,x24,#32 ror x25,x25,#32 @@ -199,17 +199,17 @@ ChaCha20_ctr32: add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input + ldp x6,x8,[PTR(1),#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] + ldp x10,x12,[PTR(1),#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] + ldp x14,x16,[PTR(1),#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 + ldp x19,x21,[PTR(1),#48] + add PTR(1),PTR(1),#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 @@ -229,22 +229,22 @@ ChaCha20_ctr32: eor x17,x17,x19 eor x20,x20,x21 - stp x5,x7,[x0,#0] // store output + stp x5,x7,[PTR(0),#0] // store output add x28,x28,#1 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 + stp x9,x11,[PTR(0),#16] + stp x13,x15,[PTR(0),#32] + stp x17,x20,[PTR(0),#48] + add PTR(0),PTR(0),#64 b.hi .Loop_outer - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) .Labort: AARCH64_VALIDATE_LINK_REGISTER ret @@ -253,10 +253,10 @@ ChaCha20_ctr32: .Ltail: add x2,x2,#64 .Less_than_64: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 + sub PTR(0),PTR(0),#1 + add PTR(1),PTR(1),x2 + add PTR(0),PTR(0),x2 + add PTR(4),PTRN(sp),x2 neg x2,x2 add x5,x5,x6,lsl#32 // pack @@ -277,31 +277,31 @@ ChaCha20_ctr32: rev x17,x17 rev x20,x20 #endif - stp x5,x7,[sp,#0] - stp x9,x11,[sp,#16] - stp x13,x15,[sp,#32] - stp x17,x20,[sp,#48] + stp x5,x7,[PTRN(sp),#0] + stp x9,x11,[PTRN(sp),#16] + stp x13,x15,[PTRN(sp),#32] + stp x17,x20,[PTRN(sp),#48] .Loop_tail: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] + ldrb w10,[PTR(1),x2] + ldrb w11,[PTR(4),x2] add x2,x2,#1 eor w10,w10,w11 - strb w10,[x0,x2] + strb w10,[PTR(0),x2] cbnz x2,.Loop_tail - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + stp xzr,xzr,[PTRN(sp),#0] + stp xzr,xzr,[PTRN(sp),#16] + stp xzr,xzr,[PTRN(sp),#32] + stp xzr,xzr,[PTRN(sp),#48] + + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_ctr32,.-ChaCha20_ctr32 @@ -314,29 +314,29 @@ ChaCha20_ctr32: ChaCha20_neon: AARCH64_SIGN_LINK_REGISTER .LChaCha20_neon: - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + + adr PTR(5),.Lsigma + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] cmp x2,#512 b.hs .L512_or_more_neon - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ld1 {v0.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v1.4s,v2.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v3.4s},[x4] - stp d8,d9,[sp] // meet ABI requirements - ld1 {v8.4s,v9.4s},[x5] + sub PTRN(sp),PTRN(sp),#64 + + ldp x22,x23,[PTR(5)] // load sigma + ld1 {v0.4s},[PTR(5)],#16 + ldp x24,x25,[PTR(3)] // load key + ldp x26,x27,[PTR(3),#16] + ld1 {v1.4s,v2.4s},[PTR(3)] + ldp x28,x30,[PTR(4)] // load counter + ld1 {v3.4s},[PTR(4)] + stp d8,d9,[PTRN(sp)] // meet ABI requirements + ld1 {v8.4s,v9.4s},[PTR(5)] #ifdef __AARCH64EB__ rev64 v0.4s,v0.4s ror x24,x24,#32 @@ -654,21 +654,21 @@ ChaCha20_neon: add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input + ldp x6,x8,[PTR(1),#0] // load input add v16.4s,v16.4s,v0.4s // accumulate key block add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] + ldp x10,x12,[PTR(1),#16] add v17.4s,v17.4s,v1.4s add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] + ldp x14,x16,[PTR(1),#32] add v18.4s,v18.4s,v2.4s add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] + ldp x19,x21,[PTR(1),#48] add v19.4s,v19.4s,v3.4s - add x1,x1,#64 + add PTR(1),PTR(1),#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 @@ -679,7 +679,7 @@ ChaCha20_neon: rev x17,x17 rev x20,x20 #endif - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[PTR(1)],#64 eor x5,x5,x6 add v20.4s,v20.4s,v0.4s eor x7,x7,x8 @@ -698,79 +698,79 @@ ChaCha20_neon: eor x20,x20,x21 eor v19.16b,v19.16b,v7.16b add v8.4s,v8.4s,v4.4s // += 5 - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[PTR(1)],#64 - stp x5,x7,[x0,#0] // store output + stp x5,x7,[PTR(0),#0] // store output add x28,x28,#5 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 + stp x9,x11,[PTR(0),#16] + stp x13,x15,[PTR(0),#32] + stp x17,x20,[PTR(0),#48] + add PTR(0),PTR(0),#64 - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[PTR(0)],#64 add v24.4s,v24.4s,v0.4s add v25.4s,v25.4s,v1.4s add v26.4s,v26.4s,v2.4s add v27.4s,v27.4s,v3.4s - ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[PTR(1)],#64 eor v20.16b,v20.16b,v4.16b eor v21.16b,v21.16b,v5.16b eor v22.16b,v22.16b,v6.16b eor v23.16b,v23.16b,v7.16b - st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[PTR(0)],#64 add v28.4s,v28.4s,v0.4s add v29.4s,v29.4s,v1.4s add v30.4s,v30.4s,v2.4s add v31.4s,v31.4s,v3.4s - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[PTR(1)],#64 eor v24.16b,v24.16b,v16.16b eor v25.16b,v25.16b,v17.16b eor v26.16b,v26.16b,v18.16b eor v27.16b,v27.16b,v19.16b - st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + st1 {v24.16b,v25.16b,v26.16b,v27.16b},[PTR(0)],#64 eor v28.16b,v28.16b,v20.16b eor v29.16b,v29.16b,v21.16b eor v30.16b,v30.16b,v22.16b eor 
v31.16b,v31.16b,v23.16b - st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 + st1 {v28.16b,v29.16b,v30.16b,v31.16b},[PTR(0)],#64 b.hi .Loop_outer_neon - ldp d8,d9,[sp] // meet ABI requirements + ldp d8,d9,[PTRN(sp)] // meet ABI requirements - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .align 4 .Ltail_neon: add x2,x2,#320 - ldp d8,d9,[sp] // meet ABI requirements + ldp d8,d9,[PTRN(sp)] // meet ABI requirements cmp x2,#64 b.lo .Less_than_64 add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input + ldp x6,x8,[PTR(1),#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] + ldp x10,x12,[PTR(1),#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] + ldp x14,x16,[PTR(1),#32] add x17,x17,x19,lsl#32 add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 + ldp x19,x21,[PTR(1),#48] + add PTR(1),PTR(1),#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 @@ -790,26 +790,26 @@ ChaCha20_neon: eor x17,x17,x19 eor x20,x20,x21 - stp x5,x7,[x0,#0] // store output + stp x5,x7,[PTR(0),#0] // store output add v16.4s,v16.4s,v0.4s // accumulate key block - stp x9,x11,[x0,#16] + stp x9,x11,[PTR(0),#16] add v17.4s,v17.4s,v1.4s - stp x13,x15,[x0,#32] + stp x13,x15,[PTR(0),#32] add v18.4s,v18.4s,v2.4s - stp x17,x20,[x0,#48] + stp x17,x20,[PTR(0),#48] add v19.4s,v19.4s,v3.4s - add x0,x0,#64 + add PTR(0),PTR(0),#64 b.eq .Ldone_neon sub x2,x2,#64 cmp x2,#64 b.lo .Last_neon - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[PTR(1)],#64 eor v16.16b,v16.16b,v4.16b eor v17.16b,v17.16b,v5.16b eor v18.16b,v18.16b,v6.16b eor v19.16b,v19.16b,v7.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[PTR(0)],#64 b.eq .Ldone_neon add v16.4s,v20.4s,v0.4s @@ -820,12 +820,12 @@ ChaCha20_neon: add v19.4s,v23.4s,v3.4s b.lo .Last_neon - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[PTR(1)],#64 eor v20.16b,v16.16b,v4.16b eor v21.16b,v17.16b,v5.16b eor v22.16b,v18.16b,v6.16b eor v23.16b,v19.16b,v7.16b - st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[PTR(0)],#64 b.eq .Ldone_neon add v16.4s,v24.4s,v0.4s @@ -836,12 +836,12 @@ ChaCha20_neon: add v19.4s,v27.4s,v3.4s b.lo .Last_neon - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[PTR(1)],#64 eor v24.16b,v16.16b,v4.16b eor v25.16b,v17.16b,v5.16b eor v26.16b,v18.16b,v6.16b eor v27.16b,v19.16b,v7.16b - st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + st1 {v24.16b,v25.16b,v26.16b,v27.16b},[PTR(0)],#64 b.eq .Ldone_neon add v16.4s,v28.4s,v0.4s @@ -851,35 +851,35 @@ ChaCha20_neon: sub x2,x2,#64 .Last_neon: - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[PTRN(sp)] - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 + sub PTR(0),PTR(0),#1 + add PTR(1),PTR(1),x2 + add PTR(0),PTR(0),x2 + add PTR(4),PTRN(sp),x2 neg x2,x2 .Loop_tail_neon: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] + ldrb w10,[PTR(1),x2] + ldrb w11,[PTR(4),x2] add x2,x2,#1 eor w10,w10,w11 - strb 
w10,[x0,x2] + strb w10,[PTR(0),x2] cbnz x2,.Loop_tail_neon - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] + stp xzr,xzr,[PTRN(sp),#0] + stp xzr,xzr,[PTRN(sp),#16] + stp xzr,xzr,[PTRN(sp),#32] + stp xzr,xzr,[PTRN(sp),#48] .Ldone_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_neon,.-ChaCha20_neon @@ -887,29 +887,29 @@ ChaCha20_neon: .align 5 ChaCha20_512_neon: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 - adr x5,.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] + adr PTR(5),.Lsigma + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] .L512_or_more_neon: - sub sp,sp,#128+64 + sub PTRN(sp),PTRN(sp),#128+64 eor v7.16b,v7.16b,v7.16b - ldp x22,x23,[x5] // load sigma - ld1 {v0.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v1.4s,v2.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v3.4s},[x4] - ld1 {v7.s}[0],[x5] - add x3,x5,#16 // .Lrot24 + ldp x22,x23,[PTR(5)] // load sigma + ld1 {v0.4s},[PTR(5)],#16 + ldp x24,x25,[PTR(3)] // load key + ldp x26,x27,[PTR(3),#16] + ld1 {v1.4s,v2.4s},[PTR(3)] + ldp x28,x30,[PTR(4)] // load counter + ld1 {v3.4s},[PTR(4)] + ld1 {v7.s}[0],[PTR(5)] + add PTR(3),PTR(5),#16 // .Lrot24 #ifdef __AARCH64EB__ rev64 v0.4s,v0.4s ror x24,x24,#32 @@ -920,18 +920,18 @@ ChaCha20_512_neon: ror x30,x30,#32 #endif add v3.4s,v3.4s,v7.4s // += 1 - stp q0,q1,[sp,#0] // off-load key block, invariant part + stp q0,q1,[PTRN(sp),#0] // off-load key block, invariant part add v3.4s,v3.4s,v7.4s // not typo - str q2,[sp,#32] + str q2,[PTRN(sp),#32] add v4.4s,v3.4s,v7.4s add v5.4s,v4.4s,v7.4s add v6.4s,v5.4s,v7.4s shl v7.4s,v7.4s,#2 // 1 -> 4 - stp d8,d9,[sp,#128+0] // meet ABI requirements - stp d10,d11,[sp,#128+16] - stp d12,d13,[sp,#128+32] - stp d14,d15,[sp,#128+48] + stp d8,d9,[PTRN(sp),#128+0] // meet ABI requirements + stp d10,d11,[PTRN(sp),#128+16] + stp d12,d13,[PTRN(sp),#128+32] + stp d14,d15,[PTRN(sp),#128+48] sub x2,x2,#512 // not typo @@ -975,12 +975,12 @@ ChaCha20_512_neon: mov v22.16b,v2.16b lsr x21,x30,#32 mov v26.16b,v2.16b - stp q3,q4,[sp,#48] // off-load key block, variable part + stp q3,q4,[PTRN(sp),#48] // off-load key block, variable part mov v30.16b,v2.16b - stp q5,q6,[sp,#80] + stp q5,q6,[PTRN(sp),#80] mov x4,#5 - ld1 {v6.4s},[x3] + ld1 {v6.4s},[PTR(3)] subs x2,x2,#512 .Loop_upper_neon: sub x4,x4,#1 @@ -1401,17 +1401,17 @@ ChaCha20_512_neon: add x5,x5,x6,lsl#32 // pack add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input + ldp x6,x8,[PTR(1),#0] // load input add x9,x9,x10,lsl#32 add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] + ldp x10,x12,[PTR(1),#16] add x13,x13,x14,lsl#32 add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] + ldp x14,x16,[PTR(1),#32] add x17,x17,x19,lsl#32 add 
x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 + ldp x19,x21,[PTR(1),#48] + add PTR(1),PTR(1),#64 #ifdef __AARCH64EB__ rev x5,x5 rev x7,x7 @@ -1431,18 +1431,18 @@ ChaCha20_512_neon: eor x17,x17,x19 eor x20,x20,x21 - stp x5,x7,[x0,#0] // store output + stp x5,x7,[PTR(0),#0] // store output add x28,x28,#1 // increment counter mov w5,w22 // unpack key block lsr x6,x22,#32 - stp x9,x11,[x0,#16] + stp x9,x11,[PTR(0),#16] mov w7,w23 lsr x8,x23,#32 - stp x13,x15,[x0,#32] + stp x13,x15,[PTR(0),#32] mov w9,w24 lsr x10,x24,#32 - stp x17,x20,[x0,#48] - add x0,x0,#64 + stp x17,x20,[PTR(0),#48] + add PTR(0),PTR(0),#64 mov w11,w25 lsr x12,x25,#32 mov w13,w26 @@ -1856,13 +1856,13 @@ ChaCha20_512_neon: cbnz x4,.Loop_lower_neon add w5,w5,w22 // accumulate key block - ldp q0,q1,[sp,#0] + ldp q0,q1,[PTRN(sp),#0] add x6,x6,x22,lsr#32 - ldp q2,q3,[sp,#32] + ldp q2,q3,[PTRN(sp),#32] add w7,w7,w23 - ldp q4,q5,[sp,#64] + ldp q4,q5,[PTRN(sp),#64] add x8,x8,x23,lsr#32 - ldr q6,[sp,#96] + ldr q6,[PTRN(sp),#96] add v8.4s,v8.4s,v0.4s add w9,w9,w24 add v12.4s,v12.4s,v0.4s @@ -1892,27 +1892,27 @@ ChaCha20_512_neon: add v31.4s,v31.4s,v7.4s // +4 add x7,x7,x8,lsl#32 add v11.4s,v11.4s,v3.4s - ldp x6,x8,[x1,#0] // load input + ldp x6,x8,[PTR(1),#0] // load input add v15.4s,v15.4s,v4.4s add x9,x9,x10,lsl#32 add v19.4s,v19.4s,v5.4s add x11,x11,x12,lsl#32 add v23.4s,v23.4s,v6.4s - ldp x10,x12,[x1,#16] + ldp x10,x12,[PTR(1),#16] add v27.4s,v27.4s,v3.4s add x13,x13,x14,lsl#32 add v31.4s,v31.4s,v4.4s add x15,x15,x16,lsl#32 add v9.4s,v9.4s,v1.4s - ldp x14,x16,[x1,#32] + ldp x14,x16,[PTR(1),#32] add v13.4s,v13.4s,v1.4s add x17,x17,x19,lsl#32 add v17.4s,v17.4s,v1.4s add x20,x20,x21,lsl#32 add v21.4s,v21.4s,v1.4s - ldp x19,x21,[x1,#48] + ldp x19,x21,[PTR(1),#48] add v25.4s,v25.4s,v1.4s - add x1,x1,#64 + add PTR(1),PTR(1),#64 add v29.4s,v29.4s,v1.4s #ifdef __AARCH64EB__ @@ -1925,7 +1925,7 @@ ChaCha20_512_neon: rev x17,x17 rev x20,x20 #endif - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[PTR(1)],#64 eor x5,x5,x6 eor x7,x7,x8 eor x9,x9,x10 @@ -1938,52 +1938,52 @@ ChaCha20_512_neon: eor v10.16b,v10.16b,v2.16b eor x20,x20,x21 eor v11.16b,v11.16b,v3.16b - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[PTR(1)],#64 - stp x5,x7,[x0,#0] // store output + stp x5,x7,[PTR(0),#0] // store output add x28,x28,#7 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + stp x9,x11,[PTR(0),#16] + stp x13,x15,[PTR(0),#32] + stp x17,x20,[PTR(0),#48] + add PTR(0),PTR(0),#64 + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[PTR(0)],#64 - ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[PTR(1)],#64 eor v12.16b,v12.16b,v0.16b eor v13.16b,v13.16b,v1.16b eor v14.16b,v14.16b,v2.16b eor v15.16b,v15.16b,v3.16b - st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[PTR(0)],#64 - ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[PTR(1)],#64 eor v16.16b,v16.16b,v8.16b - ldp q0,q1,[sp,#0] + ldp q0,q1,[PTRN(sp),#0] eor v17.16b,v17.16b,v9.16b - ldp q2,q3,[sp,#32] + ldp q2,q3,[PTRN(sp),#32] eor v18.16b,v18.16b,v10.16b eor v19.16b,v19.16b,v11.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[PTR(0)],#64 - ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[PTR(1)],#64 eor v20.16b,v20.16b,v12.16b eor v21.16b,v21.16b,v13.16b 
eor v22.16b,v22.16b,v14.16b eor v23.16b,v23.16b,v15.16b - st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[PTR(0)],#64 - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[PTR(1)],#64 eor v24.16b,v24.16b,v16.16b eor v25.16b,v25.16b,v17.16b eor v26.16b,v26.16b,v18.16b eor v27.16b,v27.16b,v19.16b - st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + st1 {v24.16b,v25.16b,v26.16b,v27.16b},[PTR(0)],#64 shl v8.4s,v7.4s,#1 // 4 -> 8 eor v28.16b,v28.16b,v20.16b eor v29.16b,v29.16b,v21.16b eor v30.16b,v30.16b,v22.16b eor v31.16b,v31.16b,v23.16b - st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 + st1 {v28.16b,v29.16b,v30.16b,v31.16b},[PTR(0)],#64 add v3.4s,v3.4s,v8.4s // += 8 add v4.4s,v4.4s,v8.4s @@ -1995,24 +1995,24 @@ ChaCha20_512_neon: adds x2,x2,#512 ushr v7.4s,v7.4s,#1 // 4 -> 2 - ldp d10,d11,[sp,#128+16] // meet ABI requirements - ldp d12,d13,[sp,#128+32] - ldp d14,d15,[sp,#128+48] + ldp d10,d11,[PTRN(sp),#128+16] // meet ABI requirements + ldp d12,d13,[PTRN(sp),#128+32] + ldp d14,d15,[PTRN(sp),#128+48] - stp q0,q0,[sp,#0] // wipe off-load area - stp q0,q0,[sp,#32] - stp q0,q0,[sp,#64] + stp q0,q0,[PTRN(sp),#0] // wipe off-load area + stp q0,q0,[PTRN(sp),#32] + stp q0,q0,[PTRN(sp),#64] b.eq .Ldone_512_neon - sub x3,x3,#16 // .Lone + sub PTR(3),PTR(3),#16 // .Lone cmp x2,#192 - add sp,sp,#128 + add PTRN(sp),PTRN(sp),#128 sub v3.4s,v3.4s,v7.4s // -= 2 - ld1 {v8.4s,v9.4s},[x3] + ld1 {v8.4s,v9.4s},[PTR(3)] b.hs .Loop_outer_neon - ldp d8,d9,[sp,#0] // meet ABI requirements + ldp d8,d9,[PTRN(sp),#0] // meet ABI requirements eor v1.16b,v1.16b,v1.16b eor v2.16b,v2.16b,v2.16b eor v3.16b,v3.16b,v3.16b @@ -2022,14 +2022,14 @@ ChaCha20_512_neon: b .Loop_outer .Ldone_512_neon: - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp x19,x20,[x29,#16] - add sp,sp,#128+64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + ldp d8,d9,[PTRN(sp),#128+0] // meet ABI requirements + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#128+64 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ChaCha20_512_neon,.-ChaCha20_512_neon diff --git a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S index 73c367bcf1fc..48336c0c9891 100644 --- a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S +++ b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S @@ -2397,21 +2397,21 @@ ecp_nistz256_precomputed: .align 6 ecp_nistz256_to_mont: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] + stp PTR(29),PTR(30),[PTRN(sp),#-(4*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] ldr x3,.LRR // bp[0] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] + ldp x4,x5,[PTR(1)] + ldp x6,x7,[PTR(1),#16] ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 - adr x2,.LRR // &bp[0] + adr PTR(2),.LRR // &bp[0] bl __ecp_nistz256_mul_mont - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(4*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont @@ -2422,21 +2422,21 @@ ecp_nistz256_to_mont: .align 4 ecp_nistz256_from_mont: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] + stp PTR(29),PTR(30),[PTRN(sp),#-(4*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] mov x3,#1 // bp[0] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] + ldp x4,x5,[PTR(1)] + ldp x6,x7,[PTR(1),#16] ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 - adr x2,.Lone // &bp[0] + adr PTR(2),.Lone // &bp[0] bl __ecp_nistz256_mul_mont - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(4*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont @@ -2448,20 +2448,20 @@ ecp_nistz256_from_mont: .align 4 ecp_nistz256_mul_mont: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] + stp PTR(29),PTR(30),[PTRN(sp),#-(4*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] - ldr x3,[x2] // bp[0] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] + ldr x3,[PTR(2)] // bp[0] + ldp x4,x5,[PTR(1)] + ldp x6,x7,[PTR(1),#16] ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 bl __ecp_nistz256_mul_mont - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(4*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont @@ -2472,19 +2472,19 @@ ecp_nistz256_mul_mont: .align 4 ecp_nistz256_sqr_mont: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] + stp PTR(29),PTR(30),[PTRN(sp),#-(4*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] + ldp x4,x5,[PTR(1)] + ldp x6,x7,[PTR(1),#16] ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 bl __ecp_nistz256_sqr_mont - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(4*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont @@ -2496,19 +2496,19 @@ ecp_nistz256_sqr_mont: .align 4 ecp_nistz256_add: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 - ldp x14,x15,[x1] - ldp x8,x9,[x2] - ldp x16,x17,[x1,#16] - ldp x10,x11,[x2,#16] + ldp x14,x15,[PTR(1)] + ldp x8,x9,[PTR(2)] + ldp x16,x17,[PTR(1),#16] + ldp x10,x11,[PTR(2),#16] ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 bl __ecp_nistz256_add - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_add,.-ecp_nistz256_add @@ -2519,17 +2519,17 @@ ecp_nistz256_add: .align 4 ecp_nistz256_div_by_2: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] + ldp x14,x15,[PTR(1)] + ldp x16,x17,[PTR(1),#16] ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 bl __ecp_nistz256_div_by_2 - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 @@ -2540,11 +2540,11 @@ ecp_nistz256_div_by_2: .align 4 ecp_nistz256_mul_by_2: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] + ldp x14,x15,[PTR(1)] + ldp x16,x17,[PTR(1),#16] ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 mov x8,x14 @@ -2554,7 +2554,7 @@ ecp_nistz256_mul_by_2: bl __ecp_nistz256_add // ret = a+a // 2*a - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 @@ -2565,11 +2565,11 @@ ecp_nistz256_mul_by_2: .align 4 ecp_nistz256_mul_by_3: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] + ldp x14,x15,[PTR(1)] + ldp x16,x17,[PTR(1),#16] ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 mov x8,x14 @@ -2590,7 +2590,7 @@ ecp_nistz256_mul_by_3: bl __ecp_nistz256_add // ret += a // 2*a+a=3*a - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 @@ -2602,17 +2602,17 @@ ecp_nistz256_mul_by_3: .align 4 ecp_nistz256_sub: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] + ldp x14,x15,[PTR(1)] + ldp x16,x17,[PTR(1),#16] ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 bl __ecp_nistz256_sub_from - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_sub,.-ecp_nistz256_sub @@ -2623,10 +2623,10 @@ ecp_nistz256_sub: .align 4 ecp_nistz256_neg: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 - mov x2,x1 + mov PTR(2),PTR(1) mov x14,xzr // a = 0 mov x15,xzr mov x16,xzr @@ -2636,7 +2636,7 @@ ecp_nistz256_neg: bl __ecp_nistz256_sub_from - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_neg,.-ecp_nistz256_neg @@ -2657,7 +2657,7 @@ __ecp_nistz256_mul_mont: mul x17,x7,x3 // a[3]*b[0] umulh x11,x7,x3 - ldr x3,[x2,#8] // b[1] + ldr x3,[PTR(2),#8] // b[1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 @@ -2687,7 +2687,7 @@ __ecp_nistz256_mul_mont: adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr - ldr x3,[x2,#8*(1+1)] // b[1+1] + ldr x3,[PTR(2),#8*(1+1)] // b[1+1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 @@ -2716,7 +2716,7 @@ __ecp_nistz256_mul_mont: adcs x17,x17,x11 umulh x11,x7,x3 // hi(a[3]*b[i]) adc x19,x19,xzr - ldr x3,[x2,#8*(2+1)] // b[2+1] + ldr x3,[PTR(2),#8*(2+1)] // b[2+1] adds x15,x15,x8 // accumulate high parts of multiplication lsl x8,x14,#32 adcs x16,x16,x9 @@ -2770,9 +2770,9 @@ __ecp_nistz256_mul_mont: csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo - stp x14,x15,[x0] + stp x14,x15,[PTR(0)] csel x17,x17,x11,lo - stp x16,x17,[x0,#16] + stp x16,x17,[PTR(0),#16] ret .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont @@ -2893,9 +2893,9 @@ __ecp_nistz256_sqr_mont: csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo - stp x14,x15,[x0] + stp x14,x15,[PTR(0)] csel x17,x17,x11,lo - stp x16,x17,[x0,#16] + stp x16,x17,[PTR(0),#16] ret .size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont @@ -2910,7 +2910,7 @@ __ecp_nistz256_add: adcs x15,x15,x9 adcs x16,x16,x10 adcs x17,x17,x11 - adc x1,xzr,xzr // zap x1 + adc x1,xzr,xzr // zap PTR(1) adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus sbcs x9,x15,x12 @@ -2921,9 +2921,9 @@ __ecp_nistz256_add: csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo - stp x14,x15,[x0] + stp x14,x15,[PTR(0)] csel x17,x17,x11,lo - stp x16,x17,[x0,#16] + stp x16,x17,[PTR(0),#16] ret .size __ecp_nistz256_add,.-__ecp_nistz256_add @@ -2931,26 +2931,26 @@ __ecp_nistz256_add: .type __ecp_nistz256_sub_from,%function .align 4 __ecp_nistz256_sub_from: - ldp x8,x9,[x2] - ldp x10,x11,[x2,#16] + ldp x8,x9,[PTR(2)] + ldp x10,x11,[PTR(2),#16] subs x14,x14,x8 // ret = a-b sbcs x15,x15,x9 sbcs x16,x16,x10 sbcs x17,x17,x11 - sbc x1,xzr,xzr // zap x1 + sbc x1,xzr,xzr // zap PTR(1) subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus adcs x9,x15,x12 adcs x10,x16,xzr adc x11,x17,x13 - cmp x1,xzr // did subtraction borrow? + cmp x1,xzr // did subtraction borrow? csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret csel x15,x15,x9,eq csel x16,x16,x10,eq - stp x14,x15,[x0] + stp x14,x15,[PTR(0)] csel x17,x17,x11,eq - stp x16,x17,[x0,#16] + stp x16,x17,[PTR(0),#16] ret .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from @@ -2958,26 +2958,26 @@ __ecp_nistz256_sub_from: .type __ecp_nistz256_sub_morf,%function .align 4 __ecp_nistz256_sub_morf: - ldp x8,x9,[x2] - ldp x10,x11,[x2,#16] + ldp x8,x9,[PTR(2)] + ldp x10,x11,[PTR(2),#16] subs x14,x8,x14 // ret = b-a sbcs x15,x9,x15 sbcs x16,x10,x16 sbcs x17,x11,x17 - sbc x1,xzr,xzr // zap x1 + sbc x1,xzr,xzr // zap PTR(1) subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus adcs x9,x15,x12 adcs x10,x16,xzr adc x11,x17,x13 - cmp x1,xzr // did subtraction borrow? + cmp x1,xzr // did subtraction borrow? csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret csel x15,x15,x9,eq csel x16,x16,x10,eq - stp x14,x15,[x0] + stp x14,x15,[PTR(0)] csel x17,x17,x11,eq - stp x16,x17,[x0,#16] + stp x16,x17,[PTR(0),#16] ret .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf @@ -2989,7 +2989,7 @@ __ecp_nistz256_div_by_2: adcs x9,x15,x12 adcs x10,x16,xzr adcs x11,x17,x13 - adc x1,xzr,xzr // zap x1 + adc x1,xzr,xzr // zap PTR(1) tst x14,#1 // is a even? csel x14,x14,x8,eq // ret = even ? a : a+modulus @@ -3005,9 +3005,9 @@ __ecp_nistz256_div_by_2: lsr x16,x16,#1 orr x16,x16,x17,lsl#63 lsr x17,x17,#1 - stp x14,x15,[x0] + stp x14,x15,[PTR(0)] orr x17,x17,x1,lsl#63 - stp x16,x17,[x0,#16] + stp x16,x17,[PTR(0),#16] ret .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 @@ -3016,80 +3016,80 @@ __ecp_nistz256_div_by_2: .align 5 ecp_nistz256_point_double: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - sub sp,sp,#32*4 + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#32*4 .Ldouble_shortcut: - ldp x14,x15,[x1,#32] - mov x21,x0 - ldp x16,x17,[x1,#48] - mov x22,x1 + ldp x14,x15,[PTR(1),#32] + mov PTR(21),PTR(0) + ldp x16,x17,[PTR(1),#48] + mov PTR(22),PTR(1) ldr x12,.Lpoly+8 mov x8,x14 ldr x13,.Lpoly+24 mov x9,x15 - ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + ldp x4,x5,[PTR(22),#64] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 - ldp x6,x7,[x22,#64+16] - add x0,sp,#0 + ldp x6,x7,[PTR(22),#64+16] + add PTR(0),PTRN(sp),#0 bl __ecp_nistz256_add // p256_mul_by_2(S, in_y); - add x0,sp,#64 + add PTR(0),PTRN(sp),#64 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); - ldp x8,x9,[x22] - ldp x10,x11,[x22,#16] + ldp x8,x9,[PTR(22)] + ldp x10,x11,[PTR(22),#16] mov x4,x14 // put Zsqr aside for p256_sub mov x5,x15 mov x6,x16 mov x7,x17 - add x0,sp,#32 + add PTR(0),PTRN(sp),#32 bl __ecp_nistz256_add // p256_add(M, Zsqr, in_x); - add x2,x22,#0 + add PTR(2),PTR(22),#0 mov x14,x4 // restore Zsqr mov x15,x5 - ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + ldp x4,x5,[PTRN(sp),#0] // forward load for p256_sqr_mont mov x16,x6 mov x17,x7 - ldp x6,x7,[sp,#0+16] - add x0,sp,#64 + ldp x6,x7,[PTRN(sp),#0+16] + add PTR(0),PTRN(sp),#64 bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); - add x0,sp,#0 + add PTR(0),PTRN(sp),#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); - ldr x3,[x22,#32] - ldp x4,x5,[x22,#64] - ldp x6,x7,[x22,#64+16] - add x2,x22,#32 - add x0,sp,#96 + ldr x3,[PTR(22),#32] + ldp x4,x5,[PTR(22),#64] + ldp x6,x7,[PTR(22),#64+16] + add PTR(2),PTR(22),#32 + add PTR(0),PTRN(sp),#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); mov x8,x14 mov x9,x15 - ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + ldp x4,x5,[PTRN(sp),#0] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 - ldp x6,x7,[sp,#0+16] - add x0,x21,#64 + ldp x6,x7,[PTRN(sp),#0+16] + add PTR(0),PTR(21),#64 bl __ecp_nistz256_add // p256_mul_by_2(res_z, tmp0); - add x0,sp,#96 + add PTR(0),PTRN(sp),#96 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); - ldr x3,[sp,#64] // forward load for p256_mul_mont - ldp x4,x5,[sp,#32] - ldp x6,x7,[sp,#32+16] - add x0,x21,#32 + ldr x3,[PTRN(sp),#64] // forward load for p256_mul_mont + ldp x4,x5,[PTRN(sp),#32] + ldp x6,x7,[PTRN(sp),#32+16] + add PTR(0),PTR(21),#32 bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); - add x2,sp,#64 - add x0,sp,#32 + add PTR(2),PTRN(sp),#64 + add PTR(0),PTRN(sp),#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); mov x8,x14 // duplicate M @@ -3100,56 +3100,56 @@ ecp_nistz256_point_double: mov x5,x15 mov x6,x16 mov x7,x17 - add x0,sp,#32 + add PTR(0),PTRN(sp),#32 bl __ecp_nistz256_add mov x8,x4 // restore M mov x9,x5 - ldr x3,[x22] // forward load for p256_mul_mont + ldr x3,[PTR(22)] // forward load for p256_mul_mont mov x10,x6 - ldp x4,x5,[sp,#0] + ldp x4,x5,[PTRN(sp),#0] mov x11,x7 - ldp x6,x7,[sp,#0+16] + ldp x6,x7,[PTRN(sp),#0+16] bl __ecp_nistz256_add // p256_mul_by_3(M, M); - add x2,x22,#0 - add x0,sp,#0 + add PTR(2),PTR(22),#0 + add PTR(0),PTRN(sp),#0 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); mov x8,x14 mov x9,x15 - ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + ldp x4,x5,[PTRN(sp),#32] // forward load for p256_sqr_mont mov x10,x16 mov x11,x17 - ldp x6,x7,[sp,#32+16] - add x0,sp,#96 + ldp x6,x7,[PTRN(sp),#32+16] + add PTR(0),PTRN(sp),#96 bl __ecp_nistz256_add // 
p256_mul_by_2(tmp0, S); - add x0,x21,#0 + add PTR(0),PTR(21),#0 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); - add x2,sp,#96 + add PTR(2),PTRN(sp),#96 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); - add x2,sp,#0 - add x0,sp,#0 + add PTR(2),PTRN(sp),#0 + add PTR(0),PTRN(sp),#0 bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); - ldr x3,[sp,#32] + ldr x3,[PTRN(sp),#32] mov x4,x14 // copy S mov x5,x15 mov x6,x16 mov x7,x17 - add x2,sp,#32 + add PTR(2),PTRN(sp),#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); - add x2,x21,#32 - add x0,x21,#32 + add PTR(2),PTR(21),#32 + add PTR(0),PTR(21),#32 bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x29,x30,[sp],#96 + add PTRN(sp),PTR(29),#0 // destroy frame + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_double,.-ecp_nistz256_point_double @@ -3158,20 +3158,20 @@ ecp_nistz256_point_double: .align 5 ecp_nistz256_point_add: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#32*12 - - ldp x4,x5,[x2,#64] // in2_z - ldp x6,x7,[x2,#64+16] - mov x21,x0 - mov x22,x1 - mov x23,x2 + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#32*12 + + ldp x4,x5,[PTR(2),#64] // in2_z + ldp x6,x7,[PTR(2),#64+16] + mov PTR(21),PTR(0) + mov PTR(22),PTR(1) + mov PTR(23),PTR(2) ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 orr x8,x4,x5 @@ -3179,73 +3179,73 @@ ecp_nistz256_point_add: orr x25,x8,x10 cmp x25,#0 csetm x25,ne // ~in2infty - add x0,sp,#192 + add PTR(0),PTRN(sp),#192 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); - ldp x4,x5,[x22,#64] // in1_z - ldp x6,x7,[x22,#64+16] + ldp x4,x5,[PTR(22),#64] // in1_z + ldp x6,x7,[PTR(22),#64+16] orr x8,x4,x5 orr x10,x6,x7 orr x24,x8,x10 cmp x24,#0 csetm x24,ne // ~in1infty - add x0,sp,#128 + add PTR(0),PTRN(sp),#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); - ldr x3,[x23,#64] - ldp x4,x5,[sp,#192] - ldp x6,x7,[sp,#192+16] - add x2,x23,#64 - add x0,sp,#320 + ldr x3,[PTR(23),#64] + ldp x4,x5,[PTRN(sp),#192] + ldp x6,x7,[PTRN(sp),#192+16] + add PTR(2),PTR(23),#64 + add PTR(0),PTRN(sp),#320 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); - ldr x3,[x22,#64] - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x2,x22,#64 - add x0,sp,#352 + ldr x3,[PTR(22),#64] + ldp x4,x5,[PTRN(sp),#128] + ldp x6,x7,[PTRN(sp),#128+16] + add PTR(2),PTR(22),#64 + add PTR(0),PTRN(sp),#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); - ldr x3,[x22,#32] - ldp x4,x5,[sp,#320] - ldp x6,x7,[sp,#320+16] - add x2,x22,#32 - add x0,sp,#320 + ldr x3,[PTR(22),#32] + ldp x4,x5,[PTRN(sp),#320] + ldp x6,x7,[PTRN(sp),#320+16] + add PTR(2),PTR(22),#32 + add PTR(0),PTRN(sp),#320 bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); - ldr x3,[x23,#32] - ldp x4,x5,[sp,#352] - ldp x6,x7,[sp,#352+16] - add x2,x23,#32 - add x0,sp,#352 + ldr x3,[PTR(23),#32] + ldp x4,x5,[PTRN(sp),#352] + ldp x6,x7,[PTRN(sp),#352+16] + 
add PTR(2),PTR(23),#32 + add PTR(0),PTRN(sp),#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); - add x2,sp,#320 - ldr x3,[sp,#192] // forward load for p256_mul_mont - ldp x4,x5,[x22] - ldp x6,x7,[x22,#16] - add x0,sp,#160 + add PTR(2),PTRN(sp),#320 + ldr x3,[PTRN(sp),#192] // forward load for p256_mul_mont + ldp x4,x5,[PTR(22)] + ldp x6,x7,[PTR(22),#16] + add PTR(0),PTRN(sp),#160 bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); orr x14,x14,x15 // see if result is zero orr x16,x16,x17 orr x26,x14,x16 // ~is_equal(S1,S2) - add x2,sp,#192 - add x0,sp,#256 + add PTR(2),PTRN(sp),#192 + add PTR(0),PTRN(sp),#256 bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); - ldr x3,[sp,#128] - ldp x4,x5,[x23] - ldp x6,x7,[x23,#16] - add x2,sp,#128 - add x0,sp,#288 + ldr x3,[PTRN(sp),#128] + ldp x4,x5,[PTR(23)] + ldp x6,x7,[PTR(23),#16] + add PTR(2),PTRN(sp),#128 + add PTR(0),PTRN(sp),#288 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); - add x2,sp,#256 - ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont - ldp x6,x7,[sp,#160+16] - add x0,sp,#96 + add PTR(2),PTRN(sp),#256 + ldp x4,x5,[PTRN(sp),#160] // forward load for p256_sqr_mont + ldp x6,x7,[PTRN(sp),#160+16] + add PTR(0),PTRN(sp),#96 bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); orr x14,x14,x15 // see if result is zero @@ -3260,130 +3260,130 @@ ecp_nistz256_point_add: cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) .Ladd_double: - mov x1,x22 - mov x0,x21 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - add sp,sp,#32*(12-4) // difference in stack frames + mov PTR(1),PTR(22) + mov PTR(0),PTR(21) + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#32*(12-4) // difference in stack frames b .Ldouble_shortcut .align 4 .Ladd_proceed: - add x0,sp,#192 + add PTR(0),PTRN(sp),#192 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); - ldr x3,[x22,#64] - ldp x4,x5,[sp,#96] - ldp x6,x7,[sp,#96+16] - add x2,x22,#64 - add x0,sp,#64 + ldr x3,[PTR(22),#64] + ldp x4,x5,[PTRN(sp),#96] + ldp x6,x7,[PTRN(sp),#96+16] + add PTR(2),PTR(22),#64 + add PTR(0),PTRN(sp),#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); - ldp x4,x5,[sp,#96] - ldp x6,x7,[sp,#96+16] - add x0,sp,#128 + ldp x4,x5,[PTRN(sp),#96] + ldp x6,x7,[PTRN(sp),#96+16] + add PTR(0),PTRN(sp),#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); - ldr x3,[x23,#64] - ldp x4,x5,[sp,#64] - ldp x6,x7,[sp,#64+16] - add x2,x23,#64 - add x0,sp,#64 + ldr x3,[PTR(23),#64] + ldp x4,x5,[PTRN(sp),#64] + ldp x6,x7,[PTRN(sp),#64+16] + add PTR(2),PTR(23),#64 + add PTR(0),PTRN(sp),#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); - ldr x3,[sp,#96] - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x2,sp,#96 - add x0,sp,#224 + ldr x3,[PTRN(sp),#96] + ldp x4,x5,[PTRN(sp),#128] + ldp x6,x7,[PTRN(sp),#128+16] + add PTR(2),PTRN(sp),#96 + add PTR(0),PTRN(sp),#224 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); - ldr x3,[sp,#128] - ldp x4,x5,[sp,#256] - ldp x6,x7,[sp,#256+16] - add x2,sp,#128 - add x0,sp,#288 + ldr x3,[PTRN(sp),#128] + ldp x4,x5,[PTRN(sp),#256] + ldp x6,x7,[PTRN(sp),#256+16] + add PTR(2),PTRN(sp),#128 + add PTR(0),PTRN(sp),#288 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); mov x8,x14 mov x9,x15 mov x10,x16 mov x11,x17 - add x0,sp,#128 + add PTR(0),PTRN(sp),#128 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); - add x2,sp,#192 
- add x0,sp,#0 + add PTR(2),PTRN(sp),#192 + add PTR(0),PTRN(sp),#0 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); - add x2,sp,#224 + add PTR(2),PTRN(sp),#224 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); - add x2,sp,#288 - ldr x3,[sp,#224] // forward load for p256_mul_mont - ldp x4,x5,[sp,#320] - ldp x6,x7,[sp,#320+16] - add x0,sp,#32 + add PTR(2),PTRN(sp),#288 + ldr x3,[PTRN(sp),#224] // forward load for p256_mul_mont + ldp x4,x5,[PTRN(sp),#320] + ldp x6,x7,[PTRN(sp),#320+16] + add PTR(0),PTRN(sp),#32 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); - add x2,sp,#224 - add x0,sp,#352 + add PTR(2),PTRN(sp),#224 + add PTR(0),PTRN(sp),#352 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); - ldr x3,[sp,#160] - ldp x4,x5,[sp,#32] - ldp x6,x7,[sp,#32+16] - add x2,sp,#160 - add x0,sp,#32 + ldr x3,[PTRN(sp),#160] + ldp x4,x5,[PTRN(sp),#32] + ldp x6,x7,[PTRN(sp),#32+16] + add PTR(2),PTRN(sp),#160 + add PTR(0),PTRN(sp),#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); - add x2,sp,#352 + add PTR(2),PTRN(sp),#352 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); - ldp x4,x5,[sp,#0] // res - ldp x6,x7,[sp,#0+16] - ldp x8,x9,[x23] // in2 - ldp x10,x11,[x23,#16] - ldp x14,x15,[x22,#0] // in1 + ldp x4,x5,[PTRN(sp),#0] // res + ldp x6,x7,[PTRN(sp),#0+16] + ldp x8,x9,[PTR(23)] // in2 + ldp x10,x11,[PTR(23),#16] + ldp x14,x15,[PTR(22),#0] // in1 cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#0+16] + ldp x16,x17,[PTR(22),#0+16] csel x8,x4,x8,ne csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+0+32] // res + ldp x4,x5,[PTRN(sp),#0+0+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? - ldp x6,x7,[sp,#0+0+48] + ldp x6,x7,[PTRN(sp),#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne - ldp x8,x9,[x23,#0+32] // in2 + ldp x8,x9,[PTR(23),#0+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne - ldp x10,x11,[x23,#0+48] - stp x14,x15,[x21,#0] - stp x16,x17,[x21,#0+16] - ldp x14,x15,[x22,#32] // in1 + ldp x10,x11,[PTR(23),#0+48] + stp x14,x15,[PTR(21),#0] + stp x16,x17,[PTR(21),#0+16] + ldp x14,x15,[PTR(22),#32] // in1 cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#32+16] + ldp x16,x17,[PTR(22),#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+32+32] // res + ldp x4,x5,[PTRN(sp),#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? - ldp x6,x7,[sp,#0+32+48] + ldp x6,x7,[PTRN(sp),#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne - ldp x8,x9,[x23,#32+32] // in2 + ldp x8,x9,[PTR(23),#32+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne - ldp x10,x11,[x23,#32+48] - stp x14,x15,[x21,#32] - stp x16,x17,[x21,#32+16] - ldp x14,x15,[x22,#64] // in1 + ldp x10,x11,[PTR(23),#32+48] + stp x14,x15,[PTR(21),#32] + stp x16,x17,[PTR(21),#32+16] + ldp x14,x15,[PTR(22),#64] // in1 cmp x24,#0 // ~, remember? 
- ldp x16,x17,[x22,#64+16] + ldp x16,x17,[PTR(22),#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne @@ -3393,17 +3393,17 @@ ecp_nistz256_point_add: csel x15,x9,x15,ne csel x16,x10,x16,ne csel x17,x11,x17,ne - stp x14,x15,[x21,#64] - stp x16,x17,[x21,#64+16] + stp x14,x15,[PTR(21),#64] + stp x16,x17,[PTR(21),#64+16] .Ladd_done: - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 + add PTRN(sp),PTR(29),#0 // destroy frame + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(12*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add,.-ecp_nistz256_point_add @@ -3412,32 +3412,32 @@ ecp_nistz256_point_add: .align 5 ecp_nistz256_point_add_affine: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - sub sp,sp,#32*10 - - mov x21,x0 - mov x22,x1 - mov x23,x2 + stp PTR(29),PTR(30),[PTRN(sp),#-(10*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#32*10 + + mov PTR(21),PTR(0) + mov PTR(22),PTR(1) + mov PTR(23),PTR(2) ldr x12,.Lpoly+8 ldr x13,.Lpoly+24 - ldp x4,x5,[x1,#64] // in1_z - ldp x6,x7,[x1,#64+16] + ldp x4,x5,[PTR(1),#64] // in1_z + ldp x6,x7,[PTR(1),#64+16] orr x8,x4,x5 orr x10,x6,x7 orr x24,x8,x10 cmp x24,#0 csetm x24,ne // ~in1infty - ldp x14,x15,[x2] // in2_x - ldp x16,x17,[x2,#16] - ldp x8,x9,[x2,#32] // in2_y - ldp x10,x11,[x2,#48] + ldp x14,x15,[PTR(2)] // in2_x + ldp x16,x17,[PTR(2),#16] + ldp x8,x9,[PTR(2),#32] // in2_y + ldp x10,x11,[PTR(2),#48] orr x14,x14,x15 orr x16,x16,x17 orr x8,x8,x9 @@ -3448,150 +3448,150 @@ ecp_nistz256_point_add_affine: cmp x25,#0 csetm x25,ne // ~in2infty - add x0,sp,#128 + add PTR(0),PTRN(sp),#128 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); mov x4,x14 mov x5,x15 mov x6,x16 mov x7,x17 - ldr x3,[x23] - add x2,x23,#0 - add x0,sp,#96 + ldr x3,[PTR(23)] + add PTR(2),PTR(23),#0 + add PTR(0),PTRN(sp),#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); - add x2,x22,#0 - ldr x3,[x22,#64] // forward load for p256_mul_mont - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x0,sp,#160 + add PTR(2),PTR(22),#0 + ldr x3,[PTR(22),#64] // forward load for p256_mul_mont + ldp x4,x5,[PTRN(sp),#128] + ldp x6,x7,[PTRN(sp),#128+16] + add PTR(0),PTRN(sp),#160 bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); - add x2,x22,#64 - add x0,sp,#128 + add PTR(2),PTR(22),#64 + add PTR(0),PTRN(sp),#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); - ldr x3,[x22,#64] - ldp x4,x5,[sp,#160] - ldp x6,x7,[sp,#160+16] - add x2,x22,#64 - add x0,sp,#64 + ldr x3,[PTR(22),#64] + ldp x4,x5,[PTRN(sp),#160] + ldp x6,x7,[PTRN(sp),#160+16] + add PTR(2),PTR(22),#64 + add PTR(0),PTRN(sp),#64 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); - ldr x3,[x23,#32] - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x2,x23,#32 - add x0,sp,#128 + ldr x3,[PTR(23),#32] + ldp x4,x5,[PTRN(sp),#128] + ldp x6,x7,[PTRN(sp),#128+16] + add PTR(2),PTR(23),#32 + add 
PTR(0),PTRN(sp),#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); - add x2,x22,#32 - ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont - ldp x6,x7,[sp,#160+16] - add x0,sp,#192 + add PTR(2),PTR(22),#32 + ldp x4,x5,[PTRN(sp),#160] // forward load for p256_sqr_mont + ldp x6,x7,[PTRN(sp),#160+16] + add PTR(0),PTRN(sp),#192 bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); - add x0,sp,#224 + add PTR(0),PTRN(sp),#224 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); - ldp x4,x5,[sp,#192] - ldp x6,x7,[sp,#192+16] - add x0,sp,#288 + ldp x4,x5,[PTRN(sp),#192] + ldp x6,x7,[PTRN(sp),#192+16] + add PTR(0),PTRN(sp),#288 bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); - ldr x3,[sp,#160] - ldp x4,x5,[sp,#224] - ldp x6,x7,[sp,#224+16] - add x2,sp,#160 - add x0,sp,#256 + ldr x3,[PTRN(sp),#160] + ldp x4,x5,[PTRN(sp),#224] + ldp x6,x7,[PTRN(sp),#224+16] + add PTR(2),PTRN(sp),#160 + add PTR(0),PTRN(sp),#256 bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); - ldr x3,[x22] - ldp x4,x5,[sp,#224] - ldp x6,x7,[sp,#224+16] - add x2,x22,#0 - add x0,sp,#96 + ldr x3,[PTR(22)] + ldp x4,x5,[PTRN(sp),#224] + ldp x6,x7,[PTRN(sp),#224+16] + add PTR(2),PTR(22),#0 + add PTR(0),PTRN(sp),#96 bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); mov x8,x14 mov x9,x15 mov x10,x16 mov x11,x17 - add x0,sp,#224 + add PTR(0),PTRN(sp),#224 bl __ecp_nistz256_add // p256_mul_by_2(Hsqr, U2); - add x2,sp,#288 - add x0,sp,#0 + add PTR(2),PTRN(sp),#288 + add PTR(0),PTRN(sp),#0 bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); - add x2,sp,#256 + add PTR(2),PTRN(sp),#256 bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); - add x2,sp,#96 - ldr x3,[x22,#32] // forward load for p256_mul_mont - ldp x4,x5,[sp,#256] - ldp x6,x7,[sp,#256+16] - add x0,sp,#32 + add PTR(2),PTRN(sp),#96 + ldr x3,[PTR(22),#32] // forward load for p256_mul_mont + ldp x4,x5,[PTRN(sp),#256] + ldp x6,x7,[PTRN(sp),#256+16] + add PTR(0),PTRN(sp),#32 bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); - add x2,x22,#32 - add x0,sp,#128 + add PTR(2),PTR(22),#32 + add PTR(0),PTRN(sp),#128 bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); - ldr x3,[sp,#192] - ldp x4,x5,[sp,#32] - ldp x6,x7,[sp,#32+16] - add x2,sp,#192 - add x0,sp,#32 + ldr x3,[PTRN(sp),#192] + ldp x4,x5,[PTRN(sp),#32] + ldp x6,x7,[PTRN(sp),#32+16] + add PTR(2),PTRN(sp),#192 + add PTR(0),PTRN(sp),#32 bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); - add x2,sp,#128 + add PTR(2),PTRN(sp),#128 bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); - ldp x4,x5,[sp,#0] // res - ldp x6,x7,[sp,#0+16] - ldp x8,x9,[x23] // in2 - ldp x10,x11,[x23,#16] - ldp x14,x15,[x22,#0] // in1 + ldp x4,x5,[PTRN(sp),#0] // res + ldp x6,x7,[PTRN(sp),#0+16] + ldp x8,x9,[PTR(23)] // in2 + ldp x10,x11,[PTR(23),#16] + ldp x14,x15,[PTR(22),#0] // in1 cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#0+16] + ldp x16,x17,[PTR(22),#0+16] csel x8,x4,x8,ne csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+0+32] // res + ldp x4,x5,[PTRN(sp),#0+0+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? 
- ldp x6,x7,[sp,#0+0+48] + ldp x6,x7,[PTRN(sp),#0+0+48] csel x14,x8,x14,ne csel x15,x9,x15,ne - ldp x8,x9,[x23,#0+32] // in2 + ldp x8,x9,[PTR(23),#0+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne - ldp x10,x11,[x23,#0+48] - stp x14,x15,[x21,#0] - stp x16,x17,[x21,#0+16] - adr x23,.Lone_mont-64 - ldp x14,x15,[x22,#32] // in1 + ldp x10,x11,[PTR(23),#0+48] + stp x14,x15,[PTR(21),#0] + stp x16,x17,[PTR(21),#0+16] + adr PTR(23),.Lone_mont-64 + ldp x14,x15,[PTR(22),#32] // in1 cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#32+16] + ldp x16,x17,[PTR(22),#32+16] csel x8,x4,x8,ne csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+32+32] // res + ldp x4,x5,[PTRN(sp),#0+32+32] // res csel x10,x6,x10,ne csel x11,x7,x11,ne cmp x25,#0 // ~, remember? - ldp x6,x7,[sp,#0+32+48] + ldp x6,x7,[PTRN(sp),#0+32+48] csel x14,x8,x14,ne csel x15,x9,x15,ne - ldp x8,x9,[x23,#32+32] // in2 + ldp x8,x9,[PTR(23),#32+32] // in2 csel x16,x10,x16,ne csel x17,x11,x17,ne - ldp x10,x11,[x23,#32+48] - stp x14,x15,[x21,#32] - stp x16,x17,[x21,#32+16] - ldp x14,x15,[x22,#64] // in1 + ldp x10,x11,[PTR(23),#32+48] + stp x14,x15,[PTR(21),#32] + stp x16,x17,[PTR(21),#32+16] + ldp x14,x15,[PTR(22),#64] // in1 cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#64+16] + ldp x16,x17,[PTR(22),#64+16] csel x8,x4,x8,ne csel x9,x5,x9,ne csel x10,x6,x10,ne @@ -3601,15 +3601,15 @@ ecp_nistz256_point_add_affine: csel x15,x9,x15,ne csel x16,x10,x16,ne csel x17,x11,x17,ne - stp x14,x15,[x21,#64] - stp x16,x17,[x21,#64+16] - - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x29,x30,[sp],#80 + stp x14,x15,[PTR(21),#64] + stp x16,x17,[PTR(21),#64+16] + + add PTRN(sp),PTR(29),#0 // destroy frame + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(10*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine @@ -3622,20 +3622,20 @@ ecp_nistz256_point_add_affine: ecp_nistz256_ord_mul_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] + stp PTR(29),PTR(30),[PTRN(sp),#-(8*PTR_WIDTH)]! 
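+	// Eight pointer-sized save slots (PTR(29)/PTR(30) and PTR(19)-PTR(24)):
+	// 8*PTR_WIDTH matches the original 64 bytes on LP64 and is 128 bytes under purecap.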
+ add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] - adr x23,.Lord - ldr x3,[x2] // bp[0] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] + adr PTR(23),.Lord + ldr x3,[PTR(2)] // bp[0] + ldp x4,x5,[PTR(1)] + ldp x6,x7,[PTR(1),#16] - ldp x12,x13,[x23,#0] - ldp x21,x22,[x23,#16] - ldr x23,[x23,#32] + ldp x12,x13,[PTR(23),#0] + ldp x21,x22,[PTR(23),#16] + ldr x23,[PTR(23),#32] mul x14,x4,x3 // a[0]*b[0] umulh x8,x4,x3 @@ -3656,7 +3656,7 @@ ecp_nistz256_ord_mul_mont: adcs x17,x17,x10 adc x19,x19,xzr mov x20,xzr - ldr x3,[x2,#8*1] // b[i] + ldr x3,[PTR(2),#8*1] // b[i] lsl x8,x24,#32 subs x16,x16,x24 @@ -3698,7 +3698,7 @@ ecp_nistz256_ord_mul_mont: adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr - ldr x3,[x2,#8*2] // b[i] + ldr x3,[PTR(2),#8*2] // b[i] lsl x8,x24,#32 subs x16,x16,x24 @@ -3740,7 +3740,7 @@ ecp_nistz256_ord_mul_mont: adcs x17,x17,x10 adcs x19,x19,x11 adc x20,xzr,xzr - ldr x3,[x2,#8*3] // b[i] + ldr x3,[PTR(2),#8*3] // b[i] lsl x8,x24,#32 subs x16,x16,x24 @@ -3812,14 +3812,14 @@ ecp_nistz256_ord_mul_mont: csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus csel x15,x15,x9,lo csel x16,x16,x10,lo - stp x14,x15,[x0] + stp x14,x15,[PTR(0)] csel x17,x17,x11,lo - stp x16,x17,[x0,#16] + stp x16,x17,[PTR(0),#16] - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldr x29,[sp],#64 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(8*PTR_WIDTH) ret .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont @@ -3832,24 +3832,24 @@ ecp_nistz256_ord_mul_mont: ecp_nistz256_ord_sqr_mont: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - adr x23,.Lord - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - - ldp x12,x13,[x23,#0] - ldp x21,x22,[x23,#16] - ldr x23,[x23,#32] + stp PTR(29),PTR(30),[PTRN(sp),#-(8*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + + adr PTR(23),.Lord + ldp x4,x5,[PTR(1)] + ldp x6,x7,[PTR(1),#16] + + ldp x12,x13,[PTR(23),#0] + ldp x21,x22,[PTR(23),#16] + ldr x23,[PTR(23),#32] b .Loop_ord_sqr .align 4 .Loop_ord_sqr: - sub x2,x2,#1 + sub PTR(2),PTR(2),#1 //////////////////////////////////////////////////////////////// // | | | | | |a1*a0| | // | | | | |a2*a0| | | @@ -4005,13 +4005,13 @@ ecp_nistz256_ord_sqr_mont: cbnz x2,.Loop_ord_sqr - stp x4,x5,[x0] - stp x6,x7,[x0,#16] + stp x4,x5,[PTR(0)] + stp x6,x7,[PTR(0),#16] - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldr x29,[sp],#64 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(8*PTR_WIDTH) ret .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont // void ecp_nistz256_scatter_w5(void *x0,const P256_POINT *x1, @@ -4022,59 +4022,59 @@ ecp_nistz256_ord_sqr_mont: ecp_nistz256_scatter_w5: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
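+	// Minimal frame: only the PTR(29)/PTR(30) pair is saved, so 2*PTR_WIDTH
+	// replaces the fixed 16-byte allocation (32 bytes under purecap).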
+ add PTR(29),PTRN(sp),#0 - add x0,x0,x2,lsl#2 + add PTR(0),PTR(0),x2,lsl#2 - ldp x4,x5,[x1] // X - ldp x6,x7,[x1,#16] - stur w4,[x0,#64*0-4] + ldp x4,x5,[PTR(1)] // X + ldp x6,x7,[PTR(1),#16] + stur w4,[PTR(0),#64*0-4] lsr x4,x4,#32 - str w5,[x0,#64*1-4] + str w5,[PTR(0),#64*1-4] lsr x5,x5,#32 - str w6,[x0,#64*2-4] + str w6,[PTR(0),#64*2-4] lsr x6,x6,#32 - str w7,[x0,#64*3-4] + str w7,[PTR(0),#64*3-4] lsr x7,x7,#32 - str w4,[x0,#64*4-4] - str w5,[x0,#64*5-4] - str w6,[x0,#64*6-4] - str w7,[x0,#64*7-4] - add x0,x0,#64*8 - - ldp x4,x5,[x1,#32] // Y - ldp x6,x7,[x1,#48] - stur w4,[x0,#64*0-4] + str w4,[PTR(0),#64*4-4] + str w5,[PTR(0),#64*5-4] + str w6,[PTR(0),#64*6-4] + str w7,[PTR(0),#64*7-4] + add PTR(0),PTR(0),#64*8 + + ldp x4,x5,[PTR(1),#32] // Y + ldp x6,x7,[PTR(1),#48] + stur w4,[PTR(0),#64*0-4] lsr x4,x4,#32 - str w5,[x0,#64*1-4] + str w5,[PTR(0),#64*1-4] lsr x5,x5,#32 - str w6,[x0,#64*2-4] + str w6,[PTR(0),#64*2-4] lsr x6,x6,#32 - str w7,[x0,#64*3-4] + str w7,[PTR(0),#64*3-4] lsr x7,x7,#32 - str w4,[x0,#64*4-4] - str w5,[x0,#64*5-4] - str w6,[x0,#64*6-4] - str w7,[x0,#64*7-4] - add x0,x0,#64*8 - - ldp x4,x5,[x1,#64] // Z - ldp x6,x7,[x1,#80] - stur w4,[x0,#64*0-4] + str w4,[PTR(0),#64*4-4] + str w5,[PTR(0),#64*5-4] + str w6,[PTR(0),#64*6-4] + str w7,[PTR(0),#64*7-4] + add PTR(0),PTR(0),#64*8 + + ldp x4,x5,[PTR(1),#64] // Z + ldp x6,x7,[PTR(1),#80] + stur w4,[PTR(0),#64*0-4] lsr x4,x4,#32 - str w5,[x0,#64*1-4] + str w5,[PTR(0),#64*1-4] lsr x5,x5,#32 - str w6,[x0,#64*2-4] + str w6,[PTR(0),#64*2-4] lsr x6,x6,#32 - str w7,[x0,#64*3-4] + str w7,[PTR(0),#64*3-4] lsr x7,x7,#32 - str w4,[x0,#64*4-4] - str w5,[x0,#64*5-4] - str w6,[x0,#64*6-4] - str w7,[x0,#64*7-4] + str w4,[PTR(0),#64*4-4] + str w5,[PTR(0),#64*5-4] + str w6,[PTR(0),#64*6-4] + str w7,[PTR(0),#64*7-4] - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 @@ -4086,23 +4086,23 @@ ecp_nistz256_scatter_w5: ecp_nistz256_gather_w5: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 cmp x2,xzr csetm x3,ne add x2,x2,x3 - add x1,x1,x2,lsl#2 - - ldr w4,[x1,#64*0] - ldr w5,[x1,#64*1] - ldr w6,[x1,#64*2] - ldr w7,[x1,#64*3] - ldr w8,[x1,#64*4] - ldr w9,[x1,#64*5] - ldr w10,[x1,#64*6] - ldr w11,[x1,#64*7] - add x1,x1,#64*8 + add PTR(1),PTR(1),x2,lsl#2 + + ldr w4,[PTR(1),#64*0] + ldr w5,[PTR(1),#64*1] + ldr w6,[PTR(1),#64*2] + ldr w7,[PTR(1),#64*3] + ldr w8,[PTR(1),#64*4] + ldr w9,[PTR(1),#64*5] + ldr w10,[PTR(1),#64*6] + ldr w11,[PTR(1),#64*7] + add PTR(1),PTR(1),#64*8 orr x4,x4,x8,lsl#32 orr x5,x5,x9,lsl#32 orr x6,x6,x10,lsl#32 @@ -4111,18 +4111,18 @@ ecp_nistz256_gather_w5: csel x5,x5,xzr,ne csel x6,x6,xzr,ne csel x7,x7,xzr,ne - stp x4,x5,[x0] // X - stp x6,x7,[x0,#16] - - ldr w4,[x1,#64*0] - ldr w5,[x1,#64*1] - ldr w6,[x1,#64*2] - ldr w7,[x1,#64*3] - ldr w8,[x1,#64*4] - ldr w9,[x1,#64*5] - ldr w10,[x1,#64*6] - ldr w11,[x1,#64*7] - add x1,x1,#64*8 + stp x4,x5,[PTR(0)] // X + stp x6,x7,[PTR(0),#16] + + ldr w4,[PTR(1),#64*0] + ldr w5,[PTR(1),#64*1] + ldr w6,[PTR(1),#64*2] + ldr w7,[PTR(1),#64*3] + ldr w8,[PTR(1),#64*4] + ldr w9,[PTR(1),#64*5] + ldr w10,[PTR(1),#64*6] + ldr w11,[PTR(1),#64*7] + add PTR(1),PTR(1),#64*8 orr x4,x4,x8,lsl#32 orr x5,x5,x9,lsl#32 orr x6,x6,x10,lsl#32 @@ -4131,17 +4131,17 @@ ecp_nistz256_gather_w5: csel x5,x5,xzr,ne csel x6,x6,xzr,ne csel x7,x7,xzr,ne - stp x4,x5,[x0,#32] // Y - stp x6,x7,[x0,#48] - - ldr w4,[x1,#64*0] - ldr w5,[x1,#64*1] - ldr w6,[x1,#64*2] - ldr w7,[x1,#64*3] - ldr w8,[x1,#64*4] - ldr w9,[x1,#64*5] - ldr w10,[x1,#64*6] - ldr w11,[x1,#64*7] + stp x4,x5,[PTR(0),#32] // Y + stp x6,x7,[PTR(0),#48] + + ldr w4,[PTR(1),#64*0] + ldr w5,[PTR(1),#64*1] + ldr w6,[PTR(1),#64*2] + ldr w7,[PTR(1),#64*3] + ldr w8,[PTR(1),#64*4] + ldr w9,[PTR(1),#64*5] + ldr w10,[PTR(1),#64*6] + ldr w11,[PTR(1),#64*7] orr x4,x4,x8,lsl#32 orr x5,x5,x9,lsl#32 orr x6,x6,x10,lsl#32 @@ -4150,10 +4150,10 @@ ecp_nistz256_gather_w5: csel x5,x5,xzr,ne csel x6,x6,xzr,ne csel x7,x7,xzr,ne - stp x4,x5,[x0,#64] // Z - stp x6,x7,[x0,#80] + stp x4,x5,[PTR(0),#64] // Z + stp x6,x7,[PTR(0),#80] - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 @@ -4165,41 +4165,41 @@ ecp_nistz256_gather_w5: ecp_nistz256_scatter_w7: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 - add x0,x0,x2 + add PTR(0),PTR(0),x2 mov x2,#64/8 .Loop_scatter_w7: - ldr x3,[x1],#8 + ldr x3,[PTR(1)],#8 subs x2,x2,#1 - prfm pstl1strm,[x0,#4096+64*0] - prfm pstl1strm,[x0,#4096+64*1] - prfm pstl1strm,[x0,#4096+64*2] - prfm pstl1strm,[x0,#4096+64*3] - prfm pstl1strm,[x0,#4096+64*4] - prfm pstl1strm,[x0,#4096+64*5] - prfm pstl1strm,[x0,#4096+64*6] - prfm pstl1strm,[x0,#4096+64*7] - strb w3,[x0,#64*0] + prfm pstl1strm,[PTR(0),#4096+64*0] + prfm pstl1strm,[PTR(0),#4096+64*1] + prfm pstl1strm,[PTR(0),#4096+64*2] + prfm pstl1strm,[PTR(0),#4096+64*3] + prfm pstl1strm,[PTR(0),#4096+64*4] + prfm pstl1strm,[PTR(0),#4096+64*5] + prfm pstl1strm,[PTR(0),#4096+64*6] + prfm pstl1strm,[PTR(0),#4096+64*7] + strb w3,[PTR(0),#64*0] lsr x3,x3,#8 - strb w3,[x0,#64*1] + strb w3,[PTR(0),#64*1] lsr x3,x3,#8 - strb w3,[x0,#64*2] + strb w3,[PTR(0),#64*2] lsr x3,x3,#8 - strb w3,[x0,#64*3] + strb w3,[PTR(0),#64*3] lsr x3,x3,#8 - strb w3,[x0,#64*4] + strb w3,[PTR(0),#64*4] lsr x3,x3,#8 - strb w3,[x0,#64*5] + strb w3,[PTR(0),#64*5] lsr x3,x3,#8 - strb w3,[x0,#64*6] + strb w3,[PTR(0),#64*6] lsr x3,x3,#8 - strb w3,[x0,#64*7] - add x0,x0,#64*8 + strb w3,[PTR(0),#64*7] + add PTR(0),PTR(0),#64*8 b.ne .Loop_scatter_w7 - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 @@ -4211,34 +4211,34 @@ ecp_nistz256_scatter_w7: ecp_nistz256_gather_w7: AARCH64_VALID_CALL_TARGET // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 cmp x2,xzr csetm x3,ne add x2,x2,x3 - add x1,x1,x2 + add PTR(1),PTR(1),x2 mov x2,#64/8 nop .Loop_gather_w7: - ldrb w4,[x1,#64*0] - prfm pldl1strm,[x1,#4096+64*0] + ldrb w4,[PTR(1),#64*0] + prfm pldl1strm,[PTR(1),#4096+64*0] subs x2,x2,#1 - ldrb w5,[x1,#64*1] - prfm pldl1strm,[x1,#4096+64*1] - ldrb w6,[x1,#64*2] - prfm pldl1strm,[x1,#4096+64*2] - ldrb w7,[x1,#64*3] - prfm pldl1strm,[x1,#4096+64*3] - ldrb w8,[x1,#64*4] - prfm pldl1strm,[x1,#4096+64*4] - ldrb w9,[x1,#64*5] - prfm pldl1strm,[x1,#4096+64*5] - ldrb w10,[x1,#64*6] - prfm pldl1strm,[x1,#4096+64*6] - ldrb w11,[x1,#64*7] - prfm pldl1strm,[x1,#4096+64*7] - add x1,x1,#64*8 + ldrb w5,[PTR(1),#64*1] + prfm pldl1strm,[PTR(1),#4096+64*1] + ldrb w6,[PTR(1),#64*2] + prfm pldl1strm,[PTR(1),#4096+64*2] + ldrb w7,[PTR(1),#64*3] + prfm pldl1strm,[PTR(1),#4096+64*3] + ldrb w8,[PTR(1),#64*4] + prfm pldl1strm,[PTR(1),#4096+64*4] + ldrb w9,[PTR(1),#64*5] + prfm pldl1strm,[PTR(1),#4096+64*5] + ldrb w10,[PTR(1),#64*6] + prfm pldl1strm,[PTR(1),#4096+64*6] + ldrb w11,[PTR(1),#64*7] + prfm pldl1strm,[PTR(1),#4096+64*7] + add PTR(1),PTR(1),#64*8 orr x4,x4,x5,lsl#8 orr x6,x6,x7,lsl#8 orr x8,x8,x9,lsl#8 @@ -4247,9 +4247,9 @@ ecp_nistz256_gather_w7: orr x4,x4,x8,lsl#32 orr x4,x4,x10,lsl#48 and x4,x4,x3 - str x4,[x0],#8 + str x4,[PTR(0)],#8 b.ne .Loop_gather_w7 - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 diff --git a/sys/crypto/openssl/aarch64/ghashv8-armx.S b/sys/crypto/openssl/aarch64/ghashv8-armx.S index 42f053d664ef..4b9e5dc75889 100644 --- a/sys/crypto/openssl/aarch64/ghashv8-armx.S +++ b/sys/crypto/openssl/aarch64/ghashv8-armx.S @@ -2,14 +2,16 @@ #include "arm_arch.h" #if __ARM_MAX_ARCH__>=7 +#ifndef __CHERI_PURE_CAPABILITY__ .arch armv8-a+crypto +#endif .text .globl gcm_init_v8 .type gcm_init_v8,%function .align 4 gcm_init_v8: AARCH64_VALID_CALL_TARGET - ld1 
{v17.2d},[x1] //load input H + ld1 {v17.2d},[PTR(1)] //load input H movi v19.16b,#0xe1 shl v19.2d,v19.2d,#57 //0xc2.0 ext v3.16b,v17.16b,v17.16b,#8 @@ -24,7 +26,7 @@ gcm_init_v8: and v16.16b,v16.16b,v17.16b orr v3.16b,v3.16b,v18.16b //H<<<=1 eor v20.16b,v3.16b,v16.16b //twisted H - st1 {v20.2d},[x0],#16 //store Htable[0] + st1 {v20.2d},[PTR(0)],#16 //store Htable[0] //calculate H^2 ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing @@ -51,7 +53,7 @@ gcm_init_v8: ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + st1 {v21.2d,v22.2d},[PTR(0)],#32 //store Htable[1..2] //calculate H^3 and H^4 pmull v0.1q,v20.1d, v22.1d pmull v5.1q,v22.1d,v22.1d @@ -92,7 +94,7 @@ gcm_init_v8: eor v16.16b,v16.16b,v20.16b eor v17.16b,v17.16b,v22.16b ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + st1 {v20.2d,v21.2d,v22.2d},[PTR(0)] //store Htable[3..5] ret .size gcm_init_v8,.-gcm_init_v8 .globl gcm_gmult_v8 @@ -100,9 +102,9 @@ gcm_init_v8: .align 4 gcm_gmult_v8: AARCH64_VALID_CALL_TARGET - ld1 {v17.2d},[x0] //load Xi + ld1 {v17.2d},[PTR(0)] //load Xi movi v19.16b,#0xe1 - ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... + ld1 {v20.2d,v21.2d},[PTR(1)] //load twisted H, ... shl v19.2d,v19.2d,#57 #ifndef __AARCH64EB__ rev64 v17.16b,v17.16b @@ -133,7 +135,7 @@ gcm_gmult_v8: rev64 v0.16b,v0.16b #endif ext v0.16b,v0.16b,v0.16b,#8 - st1 {v0.2d},[x0] //write out Xi + st1 {v0.2d},[PTR(0)] //write out Xi ret .size gcm_gmult_v8,.-gcm_gmult_v8 @@ -144,7 +146,7 @@ gcm_ghash_v8: AARCH64_VALID_CALL_TARGET cmp x3,#64 b.hs .Lgcm_ghash_v8_4x - ld1 {v0.2d},[x0] //load [rotated] Xi + ld1 {v0.2d},[PTR(0)] //load [rotated] Xi //"[rotated]" means that //loaded value would have //to be rotated in order to @@ -160,12 +162,12 @@ gcm_ghash_v8: //last block[s] are actually //loaded twice, but last //copy is not processed - ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + ld1 {v20.2d,v21.2d},[PTR(1)],#32 //load twisted H, ..., H^2 movi v19.16b,#0xe1 - ld1 {v22.2d},[x1] + ld1 {v22.2d},[PTR(1)] csel x12,xzr,x12,eq //is it time to zero x12? ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi - ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + ld1 {v16.2d},[PTR(2)],#16 //load [rotated] I[0] shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant #ifndef __AARCH64EB__ rev64 v16.16b,v16.16b @@ -173,7 +175,7 @@ gcm_ghash_v8: #endif ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] b.lo .Lodd_tail_v8 //x3 was less than 32 - ld1 {v17.2d},[x2],x12 //load [rotated] I[1] + ld1 {v17.2d},[PTR(2)],x12 //load [rotated] I[1] #ifndef __AARCH64EB__ rev64 v17.16b,v17.16b #endif @@ -196,7 +198,7 @@ gcm_ghash_v8: pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi eor v0.16b,v0.16b,v4.16b //accumulate pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) - ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + ld1 {v16.2d},[PTR(2)],x12 //load [rotated] I[i+2] eor v2.16b,v2.16b,v6.16b csel x12,xzr,x12,eq //is it time to zero x12? 
@@ -205,7 +207,7 @@ gcm_ghash_v8: ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b - ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] + ld1 {v17.2d},[PTR(2)],x12 //load [rotated] I[i+3] #ifndef __AARCH64EB__ rev64 v16.16b,v16.16b #endif @@ -266,7 +268,7 @@ gcm_ghash_v8: rev64 v0.16b,v0.16b #endif ext v0.16b,v0.16b,v0.16b,#8 - st1 {v0.2d},[x0] //write out Xi + st1 {v0.2d},[PTR(0)] //write out Xi ret .size gcm_ghash_v8,.-gcm_ghash_v8 @@ -274,13 +276,13 @@ gcm_ghash_v8: .align 4 gcm_ghash_v8_4x: .Lgcm_ghash_v8_4x: - ld1 {v0.2d},[x0] //load [rotated] Xi - ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 + ld1 {v0.2d},[PTR(0)] //load [rotated] Xi + ld1 {v20.2d,v21.2d,v22.2d},[PTR(1)],#48 //load twisted H, ..., H^2 movi v19.16b,#0xe1 - ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 + ld1 {v26.2d,v27.2d,v28.2d},[PTR(1)] //load twisted H^3, ..., H^4 shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant - ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[PTR(2)],#64 #ifndef __AARCH64EB__ rev64 v0.16b,v0.16b rev64 v5.16b,v5.16b @@ -323,7 +325,7 @@ gcm_ghash_v8_4x: .align 4 .Loop4x: eor v16.16b,v4.16b,v0.16b - ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[PTR(2)],#64 ext v3.16b,v16.16b,v16.16b,#8 #ifndef __AARCH64EB__ rev64 v5.16b,v5.16b @@ -406,7 +408,7 @@ gcm_ghash_v8_4x: ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b - ld1 {v4.2d,v5.2d,v6.2d},[x2] + ld1 {v4.2d,v5.2d,v6.2d},[PTR(2)] eor v1.16b,v1.16b,v18.16b #ifndef __AARCH64EB__ rev64 v5.16b,v5.16b @@ -458,7 +460,7 @@ gcm_ghash_v8_4x: ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b - ld1 {v4.2d,v5.2d},[x2] + ld1 {v4.2d,v5.2d},[PTR(2)] eor v1.16b,v1.16b,v18.16b #ifndef __AARCH64EB__ rev64 v5.16b,v5.16b @@ -501,7 +503,7 @@ gcm_ghash_v8_4x: ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b eor v1.16b,v1.16b,v17.16b - ld1 {v4.2d},[x2] + ld1 {v4.2d},[PTR(2)] eor v1.16b,v1.16b,v18.16b #ifndef __AARCH64EB__ rev64 v4.16b,v4.16b @@ -546,7 +548,7 @@ gcm_ghash_v8_4x: #ifndef __AARCH64EB__ rev64 v0.16b,v0.16b #endif - st1 {v0.2d},[x0] //write out Xi + st1 {v0.2d},[PTR(0)] //write out Xi ret .size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x diff --git a/sys/crypto/openssl/aarch64/keccak1600-armv8.S b/sys/crypto/openssl/aarch64/keccak1600-armv8.S index 08b3cc351213..fb95cdadc046 100644 --- a/sys/crypto/openssl/aarch64/keccak1600-armv8.S +++ b/sys/crypto/openssl/aarch64/keccak1600-armv8.S @@ -37,14 +37,14 @@ iotas: .align 5 KeccakF1600_int: AARCH64_SIGN_LINK_REGISTER - adr x28,iotas - stp x28,x30,[sp,#16] // 32 bytes on top are mine + adr PTR(28),iotas + stp PTR(28),PTR(30),[PTRN(sp),#(2*PTR_WIDTH)] // 32 bytes on top are mine b .Loop .align 4 .Loop: ////////////////////////////////////////// Theta eor x26,x0,x5 - stp x4,x9,[sp,#0] // offload pair... + stp x4,x9,[PTRN(sp),#0] // offload pair... 
eor x27,x1,x6 eor x28,x2,x7 eor x30,x3,x8 @@ -89,7 +89,7 @@ KeccakF1600_int: eor x10,x10,x4 eor x15,x15,x4 eor x20,x20,x4 - ldp x4,x9,[sp,#0] // re-load offloaded data + ldp x4,x9,[PTRN(sp),#0] // re-load offloaded data eor x26, x3,x28 // mov x26,x3 eor x8,x8,x28 eor x13,x13,x28 @@ -145,15 +145,15 @@ KeccakF1600_int: eor x0,x0,x26 bic x26,x4,x3 eor x1,x1,x27 - ldr x27,[sp,#16] + ldr PTR(27),[PTRN(sp),#16] eor x3,x3,x28 eor x4,x4,x30 eor x2,x2,x26 - ldr x30,[x27],#8 // Iota[i++] + ldr x30,[PTR(27)],#8 // Iota[i++] bic x26,x7,x6 tst x27,#255 // are we done? - str x27,[sp,#16] + str x27,[PTRN(sp),#16] bic x27,x8,x7 bic x28,x5,x9 eor x0,x0,x30 // A[0][0] ^= Iota @@ -200,7 +200,7 @@ KeccakF1600_int: bne .Loop - ldr x30,[sp,#24] + ldr PTR(30),[PTRN(sp),#(2*PTR_WIDTH+8)] AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600_int,.-KeccakF1600_int @@ -209,55 +209,55 @@ KeccakF1600_int: .align 5 KeccakF1600: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#48 - - str x0,[sp,#32] // offload argument - mov x26,x0 - ldp x0,x1,[x0,#16*0] - ldp x2,x3,[x26,#16*1] - ldp x4,x5,[x26,#16*2] - ldp x6,x7,[x26,#16*3] - ldp x8,x9,[x26,#16*4] - ldp x10,x11,[x26,#16*5] - ldp x12,x13,[x26,#16*6] - ldp x14,x15,[x26,#16*7] - ldp x16,x17,[x26,#16*8] - ldp x25,x19,[x26,#16*9] - ldp x20,x21,[x26,#16*10] - ldp x22,x23,[x26,#16*11] - ldr x24,[x26,#16*12] + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#(2*PTR_WIDTH+32) + + str PTR(0),[PTRN(sp),#32] // offload argument + mov PTR(26),PTR(0) + ldp x0,x1,[PTR(0),#16*0] + ldp x2,x3,[PTR(26),#16*1] + ldp x4,x5,[PTR(26),#16*2] + ldp x6,x7,[PTR(26),#16*3] + ldp x8,x9,[PTR(26),#16*4] + ldp x10,x11,[PTR(26),#16*5] + ldp x12,x13,[PTR(26),#16*6] + ldp x14,x15,[PTR(26),#16*7] + ldp x16,x17,[PTR(26),#16*8] + ldp x25,x19,[PTR(26),#16*9] + ldp x20,x21,[PTR(26),#16*10] + ldp x22,x23,[PTR(26),#16*11] + ldr x24,[PTR(26),#16*12] bl KeccakF1600_int - ldr x26,[sp,#32] - stp x0,x1,[x26,#16*0] - stp x2,x3,[x26,#16*1] - stp x4,x5,[x26,#16*2] - stp x6,x7,[x26,#16*3] - stp x8,x9,[x26,#16*4] - stp x10,x11,[x26,#16*5] - stp x12,x13,[x26,#16*6] - stp x14,x15,[x26,#16*7] - stp x16,x17,[x26,#16*8] - stp x25,x19,[x26,#16*9] - stp x20,x21,[x26,#16*10] - stp x22,x23,[x26,#16*11] - str x24,[x26,#16*12] - - ldp x19,x20,[x29,#16] - add sp,sp,#48 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 + ldr PTR(26),[PTRN(sp),#32] + stp x0,x1,[PTR(26),#16*0] + stp x2,x3,[PTR(26),#16*1] + stp x4,x5,[PTR(26),#16*2] + stp x6,x7,[PTR(26),#16*3] + stp x8,x9,[PTR(26),#16*4] + stp x10,x11,[PTR(26),#16*5] + stp x12,x13,[PTR(26),#16*6] + stp x14,x15,[PTR(26),#16*7] + stp x16,x17,[PTR(26),#16*8] + stp x25,x19,[PTR(26),#16*9] + stp x20,x21,[PTR(26),#16*10] + stp x22,x23,[PTR(26),#16*11] + str x24,[PTR(26),#16*12] + + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#(2*PTR_WIDTH+32) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp 
PTR(29),PTR(30),[PTRN(sp)],#(16*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600,.-KeccakF1600 @@ -267,35 +267,35 @@ KeccakF1600: .align 5 SHA3_absorb: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - stp x0,x1,[sp,#32] // offload arguments - stp x2,x3,[sp,#48] - - mov x26,x0 // uint64_t A[5][5] - mov x27,x1 // const void *inp + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#(2*PTR_WIDTH+48) + + stp PTR(0),PTR(1),[PTRN(sp),#32] // offload arguments + stp x2,x3,[PTRN(sp),#(2*PTR_WIDTH+32)] + + mov PTR(26),PTR(0) // uint64_t A[5][5] + mov PTR(27),PTR(1) // const void *inp mov x28,x2 // size_t len mov x30,x3 // size_t bsz - ldp x0,x1,[x26,#16*0] - ldp x2,x3,[x26,#16*1] - ldp x4,x5,[x26,#16*2] - ldp x6,x7,[x26,#16*3] - ldp x8,x9,[x26,#16*4] - ldp x10,x11,[x26,#16*5] - ldp x12,x13,[x26,#16*6] - ldp x14,x15,[x26,#16*7] - ldp x16,x17,[x26,#16*8] - ldp x25,x19,[x26,#16*9] - ldp x20,x21,[x26,#16*10] - ldp x22,x23,[x26,#16*11] - ldr x24,[x26,#16*12] + ldp x0,x1,[PTR(26),#16*0] + ldp x2,x3,[PTR(26),#16*1] + ldp x4,x5,[PTR(26),#16*2] + ldp x6,x7,[PTR(26),#16*3] + ldp x8,x9,[PTR(26),#16*4] + ldp x10,x11,[PTR(26),#16*5] + ldp x12,x13,[PTR(26),#16*6] + ldp x14,x15,[PTR(26),#16*7] + ldp x16,x17,[PTR(26),#16*8] + ldp x25,x19,[PTR(26),#16*9] + ldp x20,x21,[PTR(26),#16*10] + ldp x22,x23,[PTR(26),#16*11] + ldr x24,[PTR(26),#16*12] b .Loop_absorb .align 4 @@ -303,203 +303,203 @@ SHA3_absorb: subs x26,x28,x30 // len - bsz blo .Labsorbed - str x26,[sp,#48] // save len - bsz - ldr x26,[x27],#8 // *inp++ + str x26,[PTRN(sp),#(2*PTR_WIDTH+32)] // save len - bsz + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x0,x0,x26 cmp x30,#8*(0+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x1,x1,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x2,x2,x26 cmp x30,#8*(2+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x3,x3,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x4,x4,x26 cmp x30,#8*(4+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x5,x5,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x6,x6,x26 cmp x30,#8*(6+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x7,x7,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x8,x8,x26 cmp x30,#8*(8+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x9,x9,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor 
x10,x10,x26 cmp x30,#8*(10+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x11,x11,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x12,x12,x26 cmp x30,#8*(12+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x13,x13,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x14,x14,x26 cmp x30,#8*(14+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x15,x15,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x16,x16,x26 cmp x30,#8*(16+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x17,x17,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x25,x25,x26 cmp x30,#8*(18+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x19,x19,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x20,x20,x26 cmp x30,#8*(20+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x21,x21,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x22,x22,x26 cmp x30,#8*(22+2) blo .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x23,x23,x26 beq .Lprocess_block - ldr x26,[x27],#8 // *inp++ + ldr x26,[PTR(27)],#8 // *inp++ #ifdef __AARCH64EB__ rev x26,x26 #endif eor x24,x24,x26 .Lprocess_block: - str x27,[sp,#40] // save inp + str PTR(27),[PTRN(sp),#(PTR_WIDTH+32)] // save inp bl KeccakF1600_int - ldr x27,[sp,#40] // restore arguments - ldp x28,x30,[sp,#48] + ldr PTR(27),[PTRN(sp),#(PTR_WIDTH+32)] // restore arguments + ldp x28,x30,[PTRN(sp),#(2*PTR_WIDTH+32)] b .Loop_absorb .align 4 .Labsorbed: - ldr x27,[sp,#32] - stp x0,x1,[x27,#16*0] - stp x2,x3,[x27,#16*1] - stp x4,x5,[x27,#16*2] - stp x6,x7,[x27,#16*3] - stp x8,x9,[x27,#16*4] - stp x10,x11,[x27,#16*5] - stp x12,x13,[x27,#16*6] - stp x14,x15,[x27,#16*7] - stp x16,x17,[x27,#16*8] - stp x25,x19,[x27,#16*9] - stp x20,x21,[x27,#16*10] - stp x22,x23,[x27,#16*11] - str x24,[x27,#16*12] + ldr PTR(27),[PTRN(sp),#32] + stp x0,x1,[PTR(27),#16*0] + stp x2,x3,[PTR(27),#16*1] + stp x4,x5,[PTR(27),#16*2] + stp x6,x7,[PTR(27),#16*3] + stp x8,x9,[PTR(27),#16*4] + stp x10,x11,[PTR(27),#16*5] + stp x12,x13,[PTR(27),#16*6] + stp x14,x15,[PTR(27),#16*7] + stp x16,x17,[PTR(27),#16*8] + stp x25,x19,[PTR(27),#16*9] + stp x20,x21,[PTR(27),#16*10] + stp x22,x23,[PTR(27),#16*11] + str x24,[PTR(27),#16*12] mov x0,x28 // return value - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#(2*PTR_WIDTH+48) + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp 
PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(16*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_absorb,.-SHA3_absorb @@ -508,68 +508,68 @@ SHA3_absorb: .align 5 SHA3_squeeze: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] + stp PTR(29),PTR(30),[PTRN(sp),#-(6*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] - mov x19,x0 // put aside arguments - mov x20,x1 + mov PTR(19),PTR(0) // put aside arguments + mov PTR(20),PTR(1) mov x21,x2 mov x22,x3 .Loop_squeeze: - ldr x4,[x0],#8 + ldr x4,[PTR(0)],#8 cmp x21,#8 blo .Lsqueeze_tail #ifdef __AARCH64EB__ rev x4,x4 #endif - str x4,[x20],#8 + str x4,[PTR(20)],#8 subs x21,x21,#8 beq .Lsqueeze_done subs x3,x3,#8 bhi .Loop_squeeze - mov x0,x19 + mov PTR(0),PTR(19) bl KeccakF1600 - mov x0,x19 + mov PTR(0),PTR(19) mov x3,x22 b .Loop_squeeze .align 4 .Lsqueeze_tail: - strb w4,[x20],#1 + strb w4,[PTR(20)],#1 lsr x4,x4,#8 subs x21,x21,#1 beq .Lsqueeze_done - strb w4,[x20],#1 + strb w4,[PTR(20)],#1 lsr x4,x4,#8 subs x21,x21,#1 beq .Lsqueeze_done - strb w4,[x20],#1 + strb w4,[PTR(20)],#1 lsr x4,x4,#8 subs x21,x21,#1 beq .Lsqueeze_done - strb w4,[x20],#1 + strb w4,[PTR(20)],#1 lsr x4,x4,#8 subs x21,x21,#1 beq .Lsqueeze_done - strb w4,[x20],#1 + strb w4,[PTR(20)],#1 lsr x4,x4,#8 subs x21,x21,#1 beq .Lsqueeze_done - strb w4,[x20],#1 + strb w4,[PTR(20)],#1 lsr x4,x4,#8 subs x21,x21,#1 beq .Lsqueeze_done - strb w4,[x20],#1 + strb w4,[PTR(20)],#1 .Lsqueeze_done: - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x29,x30,[sp],#48 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(6*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_squeeze,.-SHA3_squeeze @@ -577,7 +577,7 @@ SHA3_squeeze: .align 5 KeccakF1600_ce: mov x9,#24 - adr x10,iotas + adr PTR(10),iotas b .Loop_ce .align 4 .Loop_ce: @@ -641,7 +641,7 @@ KeccakF1600_ce: .inst 0xce3a62f7 //bcax v23.16b,v23.16b,v26.16b, v24.16b .inst 0xce286b18 //bcax v24.16b,v24.16b,v8.16b,v26.16b // A[1][3]=A[4][1] - ld1r {v26.2d},[x10],#8 + ld1r {v26.2d},[PTR(10)],#8 .inst 0xce330fd1 //bcax v17.16b,v30.16b, v19.16b,v3.16b // A[0][3]=A[3][3] .inst 0xce2f4c72 //bcax v18.16b,v3.16b,v15.16b,v19.16b // A[0][3]=A[3][3] @@ -679,46 +679,46 @@ KeccakF1600_ce: .align 5 KeccakF1600_cext: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp d8,d9,[sp,#16] // per ABI requirement - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - ldp d0,d1,[x0,#8*0] - ldp d2,d3,[x0,#8*2] - ldp d4,d5,[x0,#8*4] - ldp d6,d7,[x0,#8*6] - ldp d8,d9,[x0,#8*8] - ldp d10,d11,[x0,#8*10] - ldp d12,d13,[x0,#8*12] - ldp d14,d15,[x0,#8*14] - ldp d16,d17,[x0,#8*16] - ldp d18,d19,[x0,#8*18] - ldp d20,d21,[x0,#8*20] - ldp d22,d23,[x0,#8*22] - ldr d24,[x0,#8*24] + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH+64)]! 
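+	// Only the pointer pair scales: the d8-d15 spill area stays 64 bytes,
+	// so the frame grows from 80 bytes (2*8+64) to 96 bytes (2*16+64).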
+ add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] // per ABI requirement + stp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + stp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + stp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] + ldp d0,d1,[PTR(0),#8*0] + ldp d2,d3,[PTR(0),#8*2] + ldp d4,d5,[PTR(0),#8*4] + ldp d6,d7,[PTR(0),#8*6] + ldp d8,d9,[PTR(0),#8*8] + ldp d10,d11,[PTR(0),#8*10] + ldp d12,d13,[PTR(0),#8*12] + ldp d14,d15,[PTR(0),#8*14] + ldp d16,d17,[PTR(0),#8*16] + ldp d18,d19,[PTR(0),#8*18] + ldp d20,d21,[PTR(0),#8*20] + ldp d22,d23,[PTR(0),#8*22] + ldr d24,[PTR(0),#8*24] bl KeccakF1600_ce - ldr x30,[sp,#8] - stp d0,d1,[x0,#8*0] - stp d2,d3,[x0,#8*2] - stp d4,d5,[x0,#8*4] - stp d6,d7,[x0,#8*6] - stp d8,d9,[x0,#8*8] - stp d10,d11,[x0,#8*10] - stp d12,d13,[x0,#8*12] - stp d14,d15,[x0,#8*14] - stp d16,d17,[x0,#8*16] - stp d18,d19,[x0,#8*18] - stp d20,d21,[x0,#8*20] - stp d22,d23,[x0,#8*22] - str d24,[x0,#8*24] - - ldp d8,d9,[sp,#16] - ldp d10,d11,[sp,#32] - ldp d12,d13,[sp,#48] - ldp d14,d15,[sp,#64] - ldr x29,[sp],#80 + ldr PTR(30),[PTRN(sp),#PTR_WIDTH] + stp d0,d1,[PTR(0),#8*0] + stp d2,d3,[PTR(0),#8*2] + stp d4,d5,[PTR(0),#8*4] + stp d6,d7,[PTR(0),#8*6] + stp d8,d9,[PTR(0),#8*8] + stp d10,d11,[PTR(0),#8*10] + stp d12,d13,[PTR(0),#8*12] + stp d14,d15,[PTR(0),#8*14] + stp d16,d17,[PTR(0),#8*16] + stp d18,d19,[PTR(0),#8*18] + stp d20,d21,[PTR(0),#8*20] + stp d22,d23,[PTR(0),#8*22] + str d24,[PTR(0),#8*24] + + ldp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] + ldp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + ldp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + ldp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH+64) AARCH64_VALIDATE_LINK_REGISTER ret .size KeccakF1600_cext,.-KeccakF1600_cext @@ -727,188 +727,188 @@ KeccakF1600_cext: .align 5 SHA3_absorb_cext: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp d8,d9,[sp,#16] // per ABI requirement - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - ldp d0,d1,[x0,#8*0] - ldp d2,d3,[x0,#8*2] - ldp d4,d5,[x0,#8*4] - ldp d6,d7,[x0,#8*6] - ldp d8,d9,[x0,#8*8] - ldp d10,d11,[x0,#8*10] - ldp d12,d13,[x0,#8*12] - ldp d14,d15,[x0,#8*14] - ldp d16,d17,[x0,#8*16] - ldp d18,d19,[x0,#8*18] - ldp d20,d21,[x0,#8*20] - ldp d22,d23,[x0,#8*22] - ldr d24,[x0,#8*24] + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH+64)]! 
+ add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] // per ABI requirement + stp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + stp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + stp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] + ldp d0,d1,[PTR(0),#8*0] + ldp d2,d3,[PTR(0),#8*2] + ldp d4,d5,[PTR(0),#8*4] + ldp d6,d7,[PTR(0),#8*6] + ldp d8,d9,[PTR(0),#8*8] + ldp d10,d11,[PTR(0),#8*10] + ldp d12,d13,[PTR(0),#8*12] + ldp d14,d15,[PTR(0),#8*14] + ldp d16,d17,[PTR(0),#8*16] + ldp d18,d19,[PTR(0),#8*18] + ldp d20,d21,[PTR(0),#8*20] + ldp d22,d23,[PTR(0),#8*22] + ldr d24,[PTR(0),#8*24] b .Loop_absorb_ce .align 4 .Loop_absorb_ce: subs x2,x2,x3 // len - bsz blo .Labsorbed_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v0.16b,v0.16b,v31.16b cmp x3,#8*(0+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v1.16b,v1.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v2.16b,v2.16b,v31.16b cmp x3,#8*(2+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v3.16b,v3.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v4.16b,v4.16b,v31.16b cmp x3,#8*(4+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v5.16b,v5.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v6.16b,v6.16b,v31.16b cmp x3,#8*(6+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v7.16b,v7.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v8.16b,v8.16b,v31.16b cmp x3,#8*(8+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v9.16b,v9.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v10.16b,v10.16b,v31.16b cmp x3,#8*(10+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v11.16b,v11.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v12.16b,v12.16b,v31.16b cmp x3,#8*(12+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v13.16b,v13.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v14.16b,v14.16b,v31.16b cmp x3,#8*(14+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v15.16b,v15.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v16.16b,v16.16b,v31.16b cmp x3,#8*(16+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // 
*inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v17.16b,v17.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v18.16b,v18.16b,v31.16b cmp x3,#8*(18+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v19.16b,v19.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v20.16b,v20.16b,v31.16b cmp x3,#8*(20+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v21.16b,v21.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v22.16b,v22.16b,v31.16b cmp x3,#8*(22+2) blo .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif eor v23.16b,v23.16b,v31.16b beq .Lprocess_block_ce - ldr d31,[x1],#8 // *inp++ + ldr d31,[PTR(1)],#8 // *inp++ #ifdef __AARCH64EB__ rev64 v31.16b,v31.16b #endif @@ -922,26 +922,26 @@ SHA3_absorb_cext: .align 4 .Labsorbed_ce: - stp d0,d1,[x0,#8*0] - stp d2,d3,[x0,#8*2] - stp d4,d5,[x0,#8*4] - stp d6,d7,[x0,#8*6] - stp d8,d9,[x0,#8*8] - stp d10,d11,[x0,#8*10] - stp d12,d13,[x0,#8*12] - stp d14,d15,[x0,#8*14] - stp d16,d17,[x0,#8*16] - stp d18,d19,[x0,#8*18] - stp d20,d21,[x0,#8*20] - stp d22,d23,[x0,#8*22] - str d24,[x0,#8*24] + stp d0,d1,[PTR(0),#8*0] + stp d2,d3,[PTR(0),#8*2] + stp d4,d5,[PTR(0),#8*4] + stp d6,d7,[PTR(0),#8*6] + stp d8,d9,[PTR(0),#8*8] + stp d10,d11,[PTR(0),#8*10] + stp d12,d13,[PTR(0),#8*12] + stp d14,d15,[PTR(0),#8*14] + stp d16,d17,[PTR(0),#8*16] + stp d18,d19,[PTR(0),#8*18] + stp d20,d21,[PTR(0),#8*20] + stp d22,d23,[PTR(0),#8*22] + str d24,[PTR(0),#8*24] add x0,x2,x3 // return value - ldp d8,d9,[sp,#16] - ldp d10,d11,[sp,#32] - ldp d12,d13,[sp,#48] - ldp d14,d15,[sp,#64] - ldp x29,x30,[sp],#80 + ldp d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] + ldp d10,d11,[PTRN(sp),#(2*PTR_WIDTH+16)] + ldp d12,d13,[PTRN(sp),#(2*PTR_WIDTH+32)] + ldp d14,d15,[PTRN(sp),#(2*PTR_WIDTH+48)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH+64) AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_absorb_cext,.-SHA3_absorb_cext @@ -950,19 +950,19 @@ SHA3_absorb_cext: .align 5 SHA3_squeeze_cext: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - mov x9,x0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
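+	// Two-slot frame; PTR(30) lives at offset PTR_WIDTH and is reloaded
+	// from there after the call to KeccakF1600_cext below.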
+ add PTR(29),PTRN(sp),#0 + mov PTR(9),PTR(0) mov x10,x3 .Loop_squeeze_ce: - ldr x4,[x9],#8 + ldr x4,[PTR(9)],#8 cmp x2,#8 blo .Lsqueeze_tail_ce #ifdef __AARCH64EB__ rev x4,x4 #endif - str x4,[x1],#8 + str x4,[PTR(1)],#8 beq .Lsqueeze_done_ce sub x2,x2,#8 @@ -970,41 +970,41 @@ SHA3_squeeze_cext: bhi .Loop_squeeze_ce bl KeccakF1600_cext - ldr x30,[sp,#8] - mov x9,x0 + ldr PTR(30),[PTRN(sp),#PTR_WIDTH] + mov PTR(9),PTR(0) mov x10,x3 b .Loop_squeeze_ce .align 4 .Lsqueeze_tail_ce: - strb w4,[x1],#1 + strb w4,[PTR(1)],#1 lsr x4,x4,#8 subs x2,x2,#1 beq .Lsqueeze_done_ce - strb w4,[x1],#1 + strb w4,[PTR(1)],#1 lsr x4,x4,#8 subs x2,x2,#1 beq .Lsqueeze_done_ce - strb w4,[x1],#1 + strb w4,[PTR(1)],#1 lsr x4,x4,#8 subs x2,x2,#1 beq .Lsqueeze_done_ce - strb w4,[x1],#1 + strb w4,[PTR(1)],#1 lsr x4,x4,#8 subs x2,x2,#1 beq .Lsqueeze_done_ce - strb w4,[x1],#1 + strb w4,[PTR(1)],#1 lsr x4,x4,#8 subs x2,x2,#1 beq .Lsqueeze_done_ce - strb w4,[x1],#1 + strb w4,[PTR(1)],#1 lsr x4,x4,#8 subs x2,x2,#1 beq .Lsqueeze_done_ce - strb w4,[x1],#1 + strb w4,[PTR(1)],#1 .Lsqueeze_done_ce: - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size SHA3_squeeze_cext,.-SHA3_squeeze_cext diff --git a/sys/crypto/openssl/aarch64/poly1305-armv8.S b/sys/crypto/openssl/aarch64/poly1305-armv8.S index 8925984c3ee0..3b63ba4e943e 100644 --- a/sys/crypto/openssl/aarch64/poly1305-armv8.S +++ b/sys/crypto/openssl/aarch64/poly1305-armv8.S @@ -18,16 +18,16 @@ poly1305_init: AARCH64_VALID_CALL_TARGET cmp x1,xzr - stp xzr,xzr,[x0] // zero hash value - stp xzr,xzr,[x0,#16] // [along with is_base2_26] + stp xzr,xzr,[PTR(0)] // zero hash value + stp xzr,xzr,[PTR(0),#16] // [along with is_base2_26] - csel x0,xzr,x0,eq + csel PTR(0),PTR(zr),PTR(0),eq b.eq .Lno_key - adrp x17,OPENSSL_armcap_P - ldr w17,[x17,#:lo12:OPENSSL_armcap_P] + adrp PTR(17),OPENSSL_armcap_P + ldr w17,[PTR(17),#:lo12:OPENSSL_armcap_P] - ldp x7,x8,[x1] // load key + ldp x7,x8,[PTR(1)] // load key mov x9,#0xfffffffc0fffffff movk x9,#0x0fff,lsl#48 #ifdef __AARCH64EB__ @@ -37,22 +37,22 @@ poly1305_init: and x7,x7,x9 // &=0ffffffc0fffffff and x9,x9,#-4 and x8,x8,x9 // &=0ffffffc0ffffffc - stp x7,x8,[x0,#32] // save key value + stp x7,x8,[PTR(0),#32] // save key value tst w17,#ARMV7_NEON - adr x12,.Lpoly1305_blocks - adr x7,.Lpoly1305_blocks_neon - adr x13,.Lpoly1305_emit - adr x8,.Lpoly1305_emit_neon + adr PTR(12),.Lpoly1305_blocks + adr PTR(7),.Lpoly1305_blocks_neon + adr PTR(13),.Lpoly1305_emit + adr PTR(8),.Lpoly1305_emit_neon - csel x12,x12,x7,eq - csel x13,x13,x8,eq + csel PTR(12),PTR(12),PTR(7),eq + csel PTR(13),PTR(13),PTR(8),eq #ifdef __ILP32__ - stp w12,w13,[x2] + stp w12,w13,[PTR(2)] #else - stp x12,x13,[x2] + stp PTR(12),PTR(13),[PTR(2)] #endif mov x0,#1 @@ -61,6 +61,7 @@ poly1305_init: .size poly1305_init,.-poly1305_init .type poly1305_blocks,%function +.type .Lpoly1305_blocks,%function .align 5 poly1305_blocks: .Lpoly1305_blocks: @@ -70,15 +71,15 @@ poly1305_blocks: ands x2,x2,#-16 b.eq .Lno_data - ldp x4,x5,[x0] // load hash value - ldp x7,x8,[x0,#32] // load key value - ldr x6,[x0,#16] + ldp x4,x5,[PTR(0)] // load hash value + ldp x7,x8,[PTR(0),#32] // load key value + ldr x6,[PTR(0),#16] add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) b .Loop .align 5 .Loop: - ldp x10,x11,[x1],#16 // load input + ldp x10,x11,[PTR(1)],#16 // load input sub x2,x2,#16 #ifdef __AARCH64EB__ rev x10,x10 @@ -121,23 +122,24 @@ poly1305_blocks: cbnz x2,.Loop - stp x4,x5,[x0] // store hash value - str x6,[x0,#16] + stp x4,x5,[PTR(0)] // store hash value + 
str x6,[PTR(0),#16] .Lno_data: ret .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,%function +.type .Lpoly1305_emit,%function .align 5 poly1305_emit: .Lpoly1305_emit: // The symbol .poly1305_emit is not a .globl symbol // but a pointer to it is returned by poly1305_init AARCH64_VALID_CALL_TARGET - ldp x4,x5,[x0] // load hash base 2^64 - ldr x6,[x0,#16] - ldp x10,x11,[x2] // load nonce + ldp x4,x5,[PTR(0)] // load hash base 2^64 + ldr x6,[PTR(0),#16] + ldp x10,x11,[PTR(2)] // load nonce adds x12,x4,#5 // compare to modulus adcs x13,x5,xzr @@ -158,7 +160,7 @@ poly1305_emit: rev x4,x4 // flip output bytes rev x5,x5 #endif - stp x4,x5,[x1] // write result + stp x4,x5,[PTR(1)] // write result ret .size poly1305_emit,.-poly1305_emit @@ -209,53 +211,54 @@ poly1305_splat: ubfx x15,x5,#14,#26 extr x16,x6,x5,#40 - str w12,[x0,#16*0] // r0 + str w12,[PTR(0),#16*0] // r0 add w12,w13,w13,lsl#2 // r1*5 - str w13,[x0,#16*1] // r1 + str w13,[PTR(0),#16*1] // r1 add w13,w14,w14,lsl#2 // r2*5 - str w12,[x0,#16*2] // s1 - str w14,[x0,#16*3] // r2 + str w12,[PTR(0),#16*2] // s1 + str w14,[PTR(0),#16*3] // r2 add w14,w15,w15,lsl#2 // r3*5 - str w13,[x0,#16*4] // s2 - str w15,[x0,#16*5] // r3 + str w13,[PTR(0),#16*4] // s2 + str w15,[PTR(0),#16*5] // r3 add w15,w16,w16,lsl#2 // r4*5 - str w14,[x0,#16*6] // s3 - str w16,[x0,#16*7] // r4 - str w15,[x0,#16*8] // s4 + str w14,[PTR(0),#16*6] // s3 + str w16,[PTR(0),#16*7] // r4 + str w15,[PTR(0),#16*8] // s4 ret .size poly1305_splat,.-poly1305_splat .type poly1305_blocks_neon,%function +.type .Lpoly1305_blocks_neon,%function .align 5 poly1305_blocks_neon: .Lpoly1305_blocks_neon: // The symbol .Lpoly1305_blocks_neon is not a .globl symbol // but a pointer to it is returned by poly1305_init AARCH64_VALID_CALL_TARGET - ldr x17,[x0,#24] + ldr x17,[PTR(0),#24] cmp x2,#128 b.hs .Lblocks_neon cbz x17,.Lpoly1305_blocks .Lblocks_neon: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-80]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH+64)]! + add PTR(29),PTRN(sp),#0 ands x2,x2,#-16 b.eq .Lno_data_neon cbz x17,.Lbase2_64_neon - ldp w10,w11,[x0] // load hash value base 2^26 - ldp w12,w13,[x0,#8] - ldr w14,[x0,#16] + ldp w10,w11,[PTR(0)] // load hash value base 2^26 + ldp w12,w13,[PTR(0),#8] + ldr w14,[PTR(0),#16] tst x2,#31 b.eq .Leven_neon - ldp x7,x8,[x0,#32] // load key value + ldp x7,x8,[PTR(0),#32] // load key value add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 lsr x5,x12,#12 @@ -266,7 +269,7 @@ poly1305_blocks_neon: adds x5,x5,x14,lsl#40 adc x14,x6,xzr // can be partially reduced... 
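
The register substitutions in these regenerated .S files follow one uniform pattern: every operand that holds an address (the state or context in argument 0, the input in argument 1, the key, nonce, or function-table pointer in argument 2, and the frame and stack pointers) is spelled through the PTR()/PTRN() macros, while lengths, limbs, and rotate-and-add arithmetic stay in plain w/x registers. A minimal sketch of how one absorb-loop line expands in the two builds, assuming the PTR()/PTRN()/PTR_WIDTH macros are in scope as used above (the expansion comments are illustrative, not part of the patch):

        ldr     d31,[PTR(1)],#8                 // *inp++
        // purecap build:        ldr d31,[c1],#8
        // plain AArch64 build:  ldr d31,[x1],#8
        stp     d8,d9,[PTRN(sp),#(2*PTR_WIDTH)] // per ABI requirement
        // purecap build:        stp d8,d9,[csp,#32]   (PTR_WIDTH is 16)
        // plain AArch64 build:  stp d8,d9,[sp,#16]    (PTR_WIDTH is 8)
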
- ldp x12,x13,[x1],#16 // load input + ldp x12,x13,[PTR(1)],#16 // load input sub x2,x2,#16 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) @@ -286,7 +289,7 @@ poly1305_blocks_neon: adc x6,x6,x3 bl poly1305_mult - ldr x30,[sp,#8] + ldr PTR(30),[PTRN(sp),#PTR_WIDTH] cbz x3,.Lstore_base2_64_neon @@ -299,28 +302,28 @@ poly1305_blocks_neon: cbnz x2,.Leven_neon - stp w10,w11,[x0] // store hash value base 2^26 - stp w12,w13,[x0,#8] - str w14,[x0,#16] + stp w10,w11,[PTR(0)] // store hash value base 2^26 + stp w12,w13,[PTR(0),#8] + str w14,[PTR(0),#16] b .Lno_data_neon .align 4 .Lstore_base2_64_neon: - stp x4,x5,[x0] // store hash value base 2^64 - stp x6,xzr,[x0,#16] // note that is_base2_26 is zeroed + stp x4,x5,[PTR(0)] // store hash value base 2^64 + stp x6,xzr,[PTR(0),#16] // note that is_base2_26 is zeroed b .Lno_data_neon .align 4 .Lbase2_64_neon: - ldp x7,x8,[x0,#32] // load key value + ldp x7,x8,[PTR(0),#32] // load key value - ldp x4,x5,[x0] // load hash value base 2^64 - ldr x6,[x0,#16] + ldp x4,x5,[PTR(0)] // load hash value base 2^64 + ldr x6,[PTR(0),#16] tst x2,#31 b.eq .Linit_neon - ldp x12,x13,[x1],#16 // load input + ldp x12,x13,[PTR(1)],#16 // load input sub x2,x2,#16 add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) #ifdef __AARCH64EB__ @@ -341,10 +344,10 @@ poly1305_blocks_neon: ubfx x13,x5,#14,#26 extr x14,x6,x5,#40 - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] + stp d8,d9,[PTRN(sp),#2*PTR_WIDTH] // meet ABI requirements + stp d10,d11,[PTRN(sp),#2*PTR_WIDTH+16] + stp d12,d13,[PTRN(sp),#2*PTR_WIDTH+32] + stp d14,d15,[PTRN(sp),#2*PTR_WIDTH+48] fmov d24,x10 fmov d25,x11 @@ -357,43 +360,43 @@ poly1305_blocks_neon: add x9,x8,x8,lsr#2 // s1 = r1 + (r1 >> 2) mov x5,x8 mov x6,xzr - add x0,x0,#48+12 + add PTR(0),PTR(0),#48+12 bl poly1305_splat bl poly1305_mult // r^2 - sub x0,x0,#4 + sub PTR(0),PTR(0),#4 bl poly1305_splat bl poly1305_mult // r^3 - sub x0,x0,#4 + sub PTR(0),PTR(0),#4 bl poly1305_splat bl poly1305_mult // r^4 - sub x0,x0,#4 + sub PTR(0),PTR(0),#4 bl poly1305_splat - ldr x30,[sp,#8] + ldr PTR(30),[PTRN(sp),#PTR_WIDTH] - add x16,x1,#32 - adr x17,.Lzeros + add PTR(16),PTR(1),#32 + adr PTR(17),.Lzeros subs x2,x2,#64 - csel x16,x17,x16,lo + csel PTR(16),PTR(17),PTR(16),lo mov x4,#1 - stur x4,[x0,#-24] // set is_base2_26 - sub x0,x0,#48 // restore original x0 + stur x4,[PTR(0),#-24] // set is_base2_26 + sub PTR(0),PTR(0),#48 // restore original PTR(0) b .Ldo_neon .align 4 .Leven_neon: - add x16,x1,#32 - adr x17,.Lzeros + add PTR(16),PTR(1),#32 + adr PTR(17),.Lzeros subs x2,x2,#64 - csel x16,x17,x16,lo + csel PTR(16),PTR(17),PTR(16),lo - stp d8,d9,[sp,#16] // meet ABI requirements - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] + stp d8,d9,[PTRN(sp),#2*PTR_WIDTH] // meet ABI requirements + stp d10,d11,[PTRN(sp),#2*PTR_WIDTH+16] + stp d12,d13,[PTRN(sp),#2*PTR_WIDTH+32] + stp d14,d15,[PTRN(sp),#2*PTR_WIDTH+48] fmov d24,x10 fmov d25,x11 @@ -402,11 +405,11 @@ poly1305_blocks_neon: fmov d28,x14 .Ldo_neon: - ldp x8,x12,[x16],#16 // inp[2:3] (or zero) - ldp x9,x13,[x16],#48 + ldp x8,x12,[PTR(16)],#16 // inp[2:3] (or zero) + ldp x9,x13,[PTR(16)],#48 lsl x3,x3,#24 - add x15,x0,#48 + add PTR(15),PTR(0),#48 #ifdef __AARCH64EB__ rev x8,x8 @@ -437,12 +440,12 @@ poly1305_blocks_neon: fmov d17,x10 fmov d18,x12 - ldp x8,x12,[x1],#16 // inp[0:1] - ldp x9,x13,[x1],#48 + ldp x8,x12,[PTR(1)],#16 // inp[0:1] + ldp x9,x13,[PTR(1)],#48 - ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64 - ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64 - ld1 
{v8.4s},[x15] + ld1 {v0.4s,v1.4s,v2.4s,v3.4s},[PTR(15)],#64 + ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[PTR(15)],#64 + ld1 {v8.4s},[PTR(15)] #ifdef __AARCH64EB__ rev x8,x8 @@ -498,12 +501,12 @@ poly1305_blocks_neon: subs x2,x2,#64 umull v23.2d,v14.2s,v7.s[2] - csel x16,x17,x16,lo + csel PTR(16),PTR(17),PTR(16),lo umull v22.2d,v14.2s,v5.s[2] umull v21.2d,v14.2s,v3.s[2] - ldp x8,x12,[x16],#16 // inp[2:3] (or zero) + ldp x8,x12,[PTR(16)],#16 // inp[2:3] (or zero) umull v20.2d,v14.2s,v1.s[2] - ldp x9,x13,[x16],#48 + ldp x9,x13,[PTR(16)],#48 umull v19.2d,v14.2s,v0.s[2] #ifdef __AARCH64EB__ rev x8,x8 @@ -564,9 +567,9 @@ poly1305_blocks_neon: add v9.2s,v9.2s,v24.2s fmov d18,x12 umlal v22.2d,v11.2s,v1.s[0] - ldp x8,x12,[x1],#16 // inp[0:1] + ldp x8,x12,[PTR(1)],#16 // inp[0:1] umlal v19.2d,v11.2s,v6.s[0] - ldp x9,x13,[x1],#48 + ldp x9,x13,[PTR(1)],#48 umlal v23.2d,v11.2s,v3.s[0] umlal v20.2d,v11.2s,v8.s[0] umlal v21.2d,v11.2s,v0.s[0] @@ -761,13 +764,13 @@ poly1305_blocks_neon: // horizontal add addp v22.2d,v22.2d,v22.2d - ldp d8,d9,[sp,#16] // meet ABI requirements + ldp d8,d9,[PTRN(sp),#2*PTR_WIDTH] // meet ABI requirements addp v19.2d,v19.2d,v19.2d - ldp d10,d11,[sp,#32] + ldp d10,d11,[PTRN(sp),#2*PTR_WIDTH+16] addp v23.2d,v23.2d,v23.2d - ldp d12,d13,[sp,#48] + ldp d12,d13,[PTRN(sp),#2*PTR_WIDTH+32] addp v20.2d,v20.2d,v20.2d - ldp d14,d15,[sp,#64] + ldp d14,d15,[PTRN(sp),#2*PTR_WIDTH+48] addp v21.2d,v21.2d,v21.2d //////////////////////////////////////////////////////////////// @@ -804,28 +807,29 @@ poly1305_blocks_neon: //////////////////////////////////////////////////////////////// // write the result, can be partially reduced - st4 {v19.s,v20.s,v21.s,v22.s}[0],[x0],#16 - st1 {v23.s}[0],[x0] + st4 {v19.s,v20.s,v21.s,v22.s}[0],[PTR(0)],#16 + st1 {v23.s}[0],[PTR(0)] .Lno_data_neon: - ldr x29,[sp],#80 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH+64) AARCH64_VALIDATE_LINK_REGISTER ret .size poly1305_blocks_neon,.-poly1305_blocks_neon .type poly1305_emit_neon,%function +.type .Lpoly1305_emit_neon,%function .align 5 poly1305_emit_neon: .Lpoly1305_emit_neon: // The symbol .Lpoly1305_emit_neon is not a .globl symbol // but a pointer to it is returned by poly1305_init AARCH64_VALID_CALL_TARGET - ldr x17,[x0,#24] + ldr x17,[PTR(0),#24] cbz x17,poly1305_emit - ldp w10,w11,[x0] // load hash value base 2^26 - ldp w12,w13,[x0,#8] - ldr w14,[x0,#16] + ldp w10,w11,[PTR(0)] // load hash value base 2^26 + ldp w12,w13,[PTR(0),#8] + ldr w14,[PTR(0),#16] add x4,x10,x11,lsl#26 // base 2^26 -> base 2^64 lsr x5,x12,#12 @@ -836,7 +840,7 @@ poly1305_emit_neon: adds x5,x5,x14,lsl#40 adc x6,x6,xzr // can be partially reduced... - ldp x10,x11,[x2] // load nonce + ldp x10,x11,[PTR(2)] // load nonce and x12,x6,#-4 // ... so reduce add x12,x12,x6,lsr#2 @@ -864,7 +868,7 @@ poly1305_emit_neon: rev x4,x4 // flip output bytes rev x5,x5 #endif - stp x4,x5,[x1] // write result + stp x4,x5,[PTR(1)] // write result ret .size poly1305_emit_neon,.-poly1305_emit_neon diff --git a/sys/crypto/openssl/aarch64/sha1-armv8.S b/sys/crypto/openssl/aarch64/sha1-armv8.S index 9e2d86072394..7f7ef9461df5 100644 --- a/sys/crypto/openssl/aarch64/sha1-armv8.S +++ b/sys/crypto/openssl/aarch64/sha1-armv8.S @@ -12,26 +12,26 @@ .align 6 sha1_block_data_order: AARCH64_VALID_CALL_TARGET - adrp x16,OPENSSL_armcap_P - ldr w16,[x16,#:lo12:OPENSSL_armcap_P] + adrp PTR(16),OPENSSL_armcap_P + ldr w16,[PTR(16),#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA1 b.ne .Lv8_entry // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-96]! 
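
All of the prologue and epilogue changes follow from the saved registers becoming pointer-sized: x29/x30 and the callee-saved x19-x28 are capabilities in the purecap build, so the fixed frame sizes (16, 80, 96, 128 bytes) are rewritten as multiples of PTR_WIDTH and the FP/SIMD save area shifts to offsets of the form 2*PTR_WIDTH+n. The generic shape, sketched here with an illustrative two-pair frame rather than any one routine's exact layout:

        AARCH64_SIGN_LINK_REGISTER
        stp     PTR(29),PTR(30),[PTRN(sp),#-(4*PTR_WIDTH)]!     // fp, lr
        add     PTR(29),PTRN(sp),#0
        stp     PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)]       // callee-saved
        // ... function body ...
        ldp     PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)]
        ldp     PTR(29),PTR(30),[PTRN(sp)],#(4*PTR_WIDTH)
        AARCH64_VALIDATE_LINK_REGISTER
        ret
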
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] + stp PTR(29),PTR(30),[PTRN(sp),#-(12*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] - ldp w20,w21,[x0] - ldp w22,w23,[x0,#8] - ldr w24,[x0,#16] + ldp w20,w21,[PTR(0)] + ldp w22,w23,[PTR(0),#8] + ldr w24,[PTR(0),#16] .Loop: - ldr x3,[x1],#64 + ldr x3,[PTR(1)],#64 movz w28,#0x7999 sub x2,x2,#1 movk w28,#0x5a82,lsl#16 @@ -43,7 +43,7 @@ sha1_block_data_order: add w24,w24,w28 // warm it up add w24,w24,w3 lsr x4,x3,#32 - ldur x5,[x1,#-56] + ldur x5,[PTR(1),#-56] bic w25,w23,w21 and w26,w22,w21 ror w27,w20,#27 @@ -68,7 +68,7 @@ sha1_block_data_order: add w22,w22,w5 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) lsr x6,x5,#32 - ldur x7,[x1,#-48] + ldur x7,[PTR(1),#-48] bic w25,w21,w24 and w26,w20,w24 ror w27,w23,#27 @@ -93,7 +93,7 @@ sha1_block_data_order: add w20,w20,w7 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) lsr x8,x7,#32 - ldur x9,[x1,#-40] + ldur x9,[PTR(1),#-40] bic w25,w24,w22 and w26,w23,w22 ror w27,w21,#27 @@ -118,7 +118,7 @@ sha1_block_data_order: add w23,w23,w9 // future e+=X[i] add w24,w24,w25 // e+=F(b,c,d) lsr x10,x9,#32 - ldur x11,[x1,#-32] + ldur x11,[PTR(1),#-32] bic w25,w22,w20 and w26,w21,w20 ror w27,w24,#27 @@ -143,7 +143,7 @@ sha1_block_data_order: add w21,w21,w11 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) lsr x12,x11,#32 - ldur x13,[x1,#-24] + ldur x13,[PTR(1),#-24] bic w25,w20,w23 and w26,w24,w23 ror w27,w22,#27 @@ -168,7 +168,7 @@ sha1_block_data_order: add w24,w24,w13 // future e+=X[i] add w20,w20,w25 // e+=F(b,c,d) lsr x14,x13,#32 - ldur x15,[x1,#-16] + ldur x15,[PTR(1),#-16] bic w25,w23,w21 and w26,w22,w21 ror w27,w20,#27 @@ -193,7 +193,7 @@ sha1_block_data_order: add w22,w22,w15 // future e+=X[i] add w23,w23,w25 // e+=F(b,c,d) lsr x16,x15,#32 - ldur x17,[x1,#-8] + ldur x17,[PTR(1),#-8] bic w25,w21,w24 and w26,w20,w24 ror w27,w23,#27 @@ -1038,7 +1038,7 @@ sha1_block_data_order: add w21,w21,w17 // future e+=X[i] add w22,w22,w25 // e+=F(b,c,d) ror w19,w19,#31 - ldp w4,w5,[x0] + ldp w4,w5,[PTR(0)] eor w25,w20,w23 ror w27,w22,#27 add w20,w20,w28 // future e+=K @@ -1047,30 +1047,30 @@ sha1_block_data_order: ror w23,w23,#2 add w20,w20,w19 // future e+=X[i] add w21,w21,w25 // e+=F(b,c,d) - ldp w6,w7,[x0,#8] + ldp w6,w7,[PTR(0),#8] eor w25,w24,w22 ror w27,w21,#27 eor w25,w25,w23 add w20,w20,w27 // e+=rot(a,5) ror w22,w22,#2 - ldr w8,[x0,#16] + ldr w8,[PTR(0),#16] add w20,w20,w25 // e+=F(b,c,d) add w21,w21,w5 add w22,w22,w6 add w20,w20,w4 add w23,w23,w7 add w24,w24,w8 - stp w20,w21,[x0] - stp w22,w23,[x0,#8] - str w24,[x0,#16] + stp w20,w21,[PTR(0)] + stp w22,w23,[PTR(0),#8] + str w24,[PTR(0),#16] cbnz x2,.Loop - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldp x25,x26,[sp,#64] - ldp x27,x28,[sp,#80] - ldr x29,[sp],#96 + ldp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + ldp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + ldr PTR(29),[PTRN(sp)],#(12*PTR_WIDTH) ret .size sha1_block_data_order,.-sha1_block_data_order .type sha1_block_armv8,%function @@ -1078,18 +1078,18 @@ sha1_block_data_order: sha1_block_armv8: .Lv8_entry: // Armv8.3-A PAuth: even 
though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 - adr x4,.Lconst + adr PTR(4),.Lconst eor v1.16b,v1.16b,v1.16b - ld1 {v0.4s},[x0],#16 - ld1 {v1.s}[0],[x0] - sub x0,x0,#16 - ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] + ld1 {v0.4s},[PTR(0)],#16 + ld1 {v1.s}[0],[PTR(0)] + sub PTR(0),PTR(0),#16 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[PTR(4)] .Loop_hw: - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[PTR(1)],#64 sub x2,x2,#1 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b @@ -1198,10 +1198,10 @@ sha1_block_armv8: cbnz x2,.Loop_hw - st1 {v0.4s},[x0],#16 - st1 {v1.s}[0],[x0] + st1 {v0.4s},[PTR(0)],#16 + st1 {v1.s}[0],[PTR(0)] - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size sha1_block_armv8,.-sha1_block_armv8 .align 6 diff --git a/sys/crypto/openssl/aarch64/sha256-armv8.S b/sys/crypto/openssl/aarch64/sha256-armv8.S index 4f3934a4890c..ed9f6b94b453 100644 --- a/sys/crypto/openssl/aarch64/sha256-armv8.S +++ b/sys/crypto/openssl/aarch64/sha256-armv8.S @@ -70,37 +70,42 @@ sha256_block_data_order: AARCH64_VALID_CALL_TARGET #ifndef __KERNEL__ - adrp x16,OPENSSL_armcap_P - ldr w16,[x16,#:lo12:OPENSSL_armcap_P] + adrp PTR(16),OPENSSL_armcap_P + ldr w16,[PTR(16),#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA256 b.ne .Lv8_entry tst w16,#ARMV7_NEON b.ne .Lneon_entry #endif AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*4 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#4*4 - ldp w20,w21,[x0] // load context - ldp w22,w23,[x0,#2*4] - ldp w24,w25,[x0,#4*4] - add x2,x1,x2,lsl#6 // end of input - ldp w26,w27,[x0,#6*4] - adr x30,.LK256 - stp x0,x2,[x29,#96] + ldp w20,w21,[PTR(0)] // load context + ldp w22,w23,[PTR(0),#2*4] + ldp w24,w25,[PTR(0),#4*4] +#ifdef __CHERI_PURE_CAPABILITY__ + lsl x17,x2,#6 + add PTR(2),PTR(1),x17 // end of input +#else + add PTR(2),PTR(1),PTR(2),lsl#6 // end of input +#endif + ldp w26,w27,[PTR(0),#6*4] + adr PTR(30),.LK256 + stp PTR(0),PTR(2),[PTR(29),#(12*PTR_WIDTH)] .Loop: - ldp w3,w4,[x1],#2*4 - ldr w19,[x30],#4 // *K++ + ldp w3,w4,[PTR(1)],#2*4 + ldr w19,[PTR(30)],#4 // *K++ eor w28,w21,w22 // magic seed - str x1,[x29,#112] + str PTR(1),[PTR(29),#(14*PTR_WIDTH)] #ifndef __AARCH64EB__ rev w3,w3 // 0 #endif @@ -122,12 +127,12 @@ sha256_block_data_order: eor w28,w28,w21 // Maj(a,b,c) eor w17,w6,w17,ror#13 // Sigma0(a) add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round //add w27,w27,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w4,w4 // 1 #endif - ldp w5,w6,[x1],#2*4 + ldp w5,w6,[PTR(1)],#2*4 add w27,w27,w17 // h+=Sigma0(a) ror w16,w23,#6 add w26,w26,w28 // h+=K[i] @@ -147,7 +152,7 @@ sha256_block_data_order: eor w19,w19,w20 // Maj(a,b,c) eor w17,w7,w17,ror#13 // Sigma0(a) add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round //add w26,w26,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w5,w5 // 2 @@ -171,12 +176,12 @@ sha256_block_data_order: 
eor w28,w28,w27 // Maj(a,b,c) eor w17,w8,w17,ror#13 // Sigma0(a) add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round //add w25,w25,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w6,w6 // 3 #endif - ldp w7,w8,[x1],#2*4 + ldp w7,w8,[PTR(1)],#2*4 add w25,w25,w17 // h+=Sigma0(a) ror w16,w21,#6 add w24,w24,w28 // h+=K[i] @@ -196,7 +201,7 @@ sha256_block_data_order: eor w19,w19,w26 // Maj(a,b,c) eor w17,w9,w17,ror#13 // Sigma0(a) add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round //add w24,w24,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w7,w7 // 4 @@ -220,12 +225,12 @@ sha256_block_data_order: eor w28,w28,w25 // Maj(a,b,c) eor w17,w10,w17,ror#13 // Sigma0(a) add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round //add w23,w23,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w8,w8 // 5 #endif - ldp w9,w10,[x1],#2*4 + ldp w9,w10,[PTR(1)],#2*4 add w23,w23,w17 // h+=Sigma0(a) ror w16,w27,#6 add w22,w22,w28 // h+=K[i] @@ -245,7 +250,7 @@ sha256_block_data_order: eor w19,w19,w24 // Maj(a,b,c) eor w17,w11,w17,ror#13 // Sigma0(a) add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round //add w22,w22,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w9,w9 // 6 @@ -269,12 +274,12 @@ sha256_block_data_order: eor w28,w28,w23 // Maj(a,b,c) eor w17,w12,w17,ror#13 // Sigma0(a) add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round //add w21,w21,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w10,w10 // 7 #endif - ldp w11,w12,[x1],#2*4 + ldp w11,w12,[PTR(1)],#2*4 add w21,w21,w17 // h+=Sigma0(a) ror w16,w25,#6 add w20,w20,w28 // h+=K[i] @@ -294,7 +299,7 @@ sha256_block_data_order: eor w19,w19,w22 // Maj(a,b,c) eor w17,w13,w17,ror#13 // Sigma0(a) add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round //add w20,w20,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w11,w11 // 8 @@ -318,12 +323,12 @@ sha256_block_data_order: eor w28,w28,w21 // Maj(a,b,c) eor w17,w14,w17,ror#13 // Sigma0(a) add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round //add w27,w27,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w12,w12 // 9 #endif - ldp w13,w14,[x1],#2*4 + ldp w13,w14,[PTR(1)],#2*4 add w27,w27,w17 // h+=Sigma0(a) ror w16,w23,#6 add w26,w26,w28 // h+=K[i] @@ -343,7 +348,7 @@ sha256_block_data_order: eor w19,w19,w20 // Maj(a,b,c) eor w17,w15,w17,ror#13 // Sigma0(a) add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round //add w26,w26,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w13,w13 // 10 @@ -367,14 +372,14 @@ sha256_block_data_order: eor w28,w28,w27 // Maj(a,b,c) eor w17,w0,w17,ror#13 // Sigma0(a) add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round //add w25,w25,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w14,w14 // 11 #endif - ldp w15,w0,[x1],#2*4 + ldp w15,w0,[PTR(1)],#2*4 add w25,w25,w17 // h+=Sigma0(a) - str w6,[sp,#12] + str w6,[PTRN(sp),#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] eor w6,w21,w21,ror#14 @@ -393,13 +398,13 @@ sha256_block_data_order: eor w19,w19,w26 
// Maj(a,b,c) eor w17,w6,w17,ror#13 // Sigma0(a) add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round //add w24,w24,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w15,w15 // 12 #endif add w24,w24,w17 // h+=Sigma0(a) - str w7,[sp,#0] + str w7,[PTRN(sp),#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] eor w7,w20,w20,ror#14 @@ -418,14 +423,14 @@ sha256_block_data_order: eor w28,w28,w25 // Maj(a,b,c) eor w17,w7,w17,ror#13 // Sigma0(a) add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round //add w23,w23,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w0,w0 // 13 #endif - ldp w1,w2,[x1] + ldp w1,w2,[PTR(1)] add w23,w23,w17 // h+=Sigma0(a) - str w8,[sp,#4] + str w8,[PTRN(sp),#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] eor w8,w27,w27,ror#14 @@ -444,14 +449,14 @@ sha256_block_data_order: eor w19,w19,w24 // Maj(a,b,c) eor w17,w8,w17,ror#13 // Sigma0(a) add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round //add w22,w22,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w1,w1 // 14 #endif - ldr w6,[sp,#12] + ldr w6,[PTRN(sp),#12] add w22,w22,w17 // h+=Sigma0(a) - str w9,[sp,#8] + str w9,[PTRN(sp),#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] eor w9,w26,w26,ror#14 @@ -470,14 +475,14 @@ sha256_block_data_order: eor w28,w28,w23 // Maj(a,b,c) eor w17,w9,w17,ror#13 // Sigma0(a) add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round //add w21,w21,w17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev w2,w2 // 15 #endif - ldr w7,[sp,#0] + ldr w7,[PTRN(sp),#0] add w21,w21,w17 // h+=Sigma0(a) - str w10,[sp,#12] + str w10,[PTRN(sp),#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w9,w4,#7 @@ -503,13 +508,13 @@ sha256_block_data_order: add w3,w3,w12 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round add w3,w3,w9 add w20,w20,w17 // h+=Sigma0(a) add w3,w3,w8 .Loop_16_xx: - ldr w8,[sp,#4] - str w11,[sp,#0] + ldr w8,[PTRN(sp),#4] + str w11,[PTRN(sp),#0] ror w16,w24,#6 add w27,w27,w19 // h+=K[i] ror w10,w5,#7 @@ -535,12 +540,12 @@ sha256_block_data_order: add w4,w4,w13 add w23,w23,w27 // d+=h add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round add w4,w4,w10 add w27,w27,w17 // h+=Sigma0(a) add w4,w4,w9 - ldr w9,[sp,#8] - str w12,[sp,#4] + ldr w9,[PTRN(sp),#8] + str w12,[PTRN(sp),#4] ror w16,w23,#6 add w26,w26,w28 // h+=K[i] ror w11,w6,#7 @@ -566,12 +571,12 @@ sha256_block_data_order: add w5,w5,w14 add w22,w22,w26 // d+=h add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round add w5,w5,w11 add w26,w26,w17 // h+=Sigma0(a) add w5,w5,w10 - ldr w10,[sp,#12] - str w13,[sp,#8] + ldr w10,[PTRN(sp),#12] + str w13,[PTRN(sp),#8] ror w16,w22,#6 add w25,w25,w19 // h+=K[i] ror w12,w7,#7 @@ -597,12 +602,12 @@ sha256_block_data_order: add w6,w6,w15 add w21,w21,w25 // d+=h add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round add w6,w6,w12 add w25,w25,w17 // h+=Sigma0(a) add w6,w6,w11 - ldr w11,[sp,#0] - str w14,[sp,#12] + ldr w11,[PTRN(sp),#0] + str w14,[PTRN(sp),#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] ror w13,w8,#7 @@ -628,12 
+633,12 @@ sha256_block_data_order: add w7,w7,w0 add w20,w20,w24 // d+=h add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round add w7,w7,w13 add w24,w24,w17 // h+=Sigma0(a) add w7,w7,w12 - ldr w12,[sp,#4] - str w15,[sp,#0] + ldr w12,[PTRN(sp),#4] + str w15,[PTRN(sp),#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] ror w14,w9,#7 @@ -659,12 +664,12 @@ sha256_block_data_order: add w8,w8,w1 add w27,w27,w23 // d+=h add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round add w8,w8,w14 add w23,w23,w17 // h+=Sigma0(a) add w8,w8,w13 - ldr w13,[sp,#8] - str w0,[sp,#4] + ldr w13,[PTRN(sp),#8] + str w0,[PTRN(sp),#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] ror w15,w10,#7 @@ -690,12 +695,12 @@ sha256_block_data_order: add w9,w9,w2 add w26,w26,w22 // d+=h add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round add w9,w9,w15 add w22,w22,w17 // h+=Sigma0(a) add w9,w9,w14 - ldr w14,[sp,#12] - str w1,[sp,#8] + ldr w14,[PTRN(sp),#12] + str w1,[PTRN(sp),#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] ror w0,w11,#7 @@ -721,12 +726,12 @@ sha256_block_data_order: add w10,w10,w3 add w25,w25,w21 // d+=h add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round add w10,w10,w0 add w21,w21,w17 // h+=Sigma0(a) add w10,w10,w15 - ldr w15,[sp,#0] - str w2,[sp,#12] + ldr w15,[PTRN(sp),#0] + str w2,[PTRN(sp),#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w1,w12,#7 @@ -752,12 +757,12 @@ sha256_block_data_order: add w11,w11,w4 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round add w11,w11,w1 add w20,w20,w17 // h+=Sigma0(a) add w11,w11,w0 - ldr w0,[sp,#4] - str w3,[sp,#0] + ldr w0,[PTRN(sp),#4] + str w3,[PTRN(sp),#0] ror w16,w24,#6 add w27,w27,w19 // h+=K[i] ror w2,w13,#7 @@ -783,12 +788,12 @@ sha256_block_data_order: add w12,w12,w5 add w23,w23,w27 // d+=h add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round add w12,w12,w2 add w27,w27,w17 // h+=Sigma0(a) add w12,w12,w1 - ldr w1,[sp,#8] - str w4,[sp,#4] + ldr w1,[PTRN(sp),#8] + str w4,[PTRN(sp),#4] ror w16,w23,#6 add w26,w26,w28 // h+=K[i] ror w3,w14,#7 @@ -814,12 +819,12 @@ sha256_block_data_order: add w13,w13,w6 add w22,w22,w26 // d+=h add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round add w13,w13,w3 add w26,w26,w17 // h+=Sigma0(a) add w13,w13,w2 - ldr w2,[sp,#12] - str w5,[sp,#8] + ldr w2,[PTRN(sp),#12] + str w5,[PTRN(sp),#8] ror w16,w22,#6 add w25,w25,w19 // h+=K[i] ror w4,w15,#7 @@ -845,12 +850,12 @@ sha256_block_data_order: add w14,w14,w7 add w21,w21,w25 // d+=h add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round add w14,w14,w4 add w25,w25,w17 // h+=Sigma0(a) add w14,w14,w3 - ldr w3,[sp,#0] - str w6,[sp,#12] + ldr w3,[PTRN(sp),#0] + str w6,[PTRN(sp),#12] ror w16,w21,#6 add w24,w24,w28 // h+=K[i] ror w5,w0,#7 @@ -876,12 +881,12 @@ sha256_block_data_order: add w15,w15,w8 add w20,w20,w24 // d+=h add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round add w15,w15,w5 add w24,w24,w17 // 
h+=Sigma0(a) add w15,w15,w4 - ldr w4,[sp,#4] - str w7,[sp,#0] + ldr w4,[PTRN(sp),#4] + str w7,[PTRN(sp),#0] ror w16,w20,#6 add w23,w23,w19 // h+=K[i] ror w6,w1,#7 @@ -907,12 +912,12 @@ sha256_block_data_order: add w0,w0,w9 add w27,w27,w23 // d+=h add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round add w0,w0,w6 add w23,w23,w17 // h+=Sigma0(a) add w0,w0,w5 - ldr w5,[sp,#8] - str w8,[sp,#4] + ldr w5,[PTRN(sp),#8] + str w8,[PTRN(sp),#4] ror w16,w27,#6 add w22,w22,w28 // h+=K[i] ror w7,w2,#7 @@ -938,12 +943,12 @@ sha256_block_data_order: add w1,w1,w10 add w26,w26,w22 // d+=h add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round add w1,w1,w7 add w22,w22,w17 // h+=Sigma0(a) add w1,w1,w6 - ldr w6,[sp,#12] - str w9,[sp,#8] + ldr w6,[PTRN(sp),#12] + str w9,[PTRN(sp),#8] ror w16,w26,#6 add w21,w21,w19 // h+=K[i] ror w8,w3,#7 @@ -969,12 +974,12 @@ sha256_block_data_order: add w2,w2,w11 add w25,w25,w21 // d+=h add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round + ldr w28,[PTR(30)],#4 // *K++, w19 in next round add w2,w2,w8 add w21,w21,w17 // h+=Sigma0(a) add w2,w2,w7 - ldr w7,[sp,#0] - str w10,[sp,#12] + ldr w7,[PTRN(sp),#0] + str w10,[PTRN(sp),#12] ror w16,w25,#6 add w20,w20,w28 // h+=K[i] ror w9,w4,#7 @@ -1000,43 +1005,43 @@ sha256_block_data_order: add w3,w3,w12 add w24,w24,w20 // d+=h add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round + ldr w19,[PTR(30)],#4 // *K++, w28 in next round add w3,w3,w9 add w20,w20,w17 // h+=Sigma0(a) add w3,w3,w8 cbnz w19,.Loop_16_xx - ldp x0,x2,[x29,#96] - ldr x1,[x29,#112] - sub x30,x30,#260 // rewind + ldp PTR(0),PTR(2),[PTR(29),#(12*PTR_WIDTH)] + ldr PTR(1),[PTR(29),#(14*PTR_WIDTH)] + sub PTR(30),PTR(30),#260 // rewind - ldp w3,w4,[x0] - ldp w5,w6,[x0,#2*4] - add x1,x1,#14*4 // advance input pointer - ldp w7,w8,[x0,#4*4] + ldp w3,w4,[PTR(0)] + ldp w5,w6,[PTR(0),#2*4] + add PTR(1),PTR(1),#14*4 // advance input pointer + ldp w7,w8,[PTR(0),#4*4] add w20,w20,w3 - ldp w9,w10,[x0,#6*4] + ldp w9,w10,[PTR(0),#6*4] add w21,w21,w4 add w22,w22,w5 add w23,w23,w6 - stp w20,w21,[x0] + stp w20,w21,[PTR(0)] add w24,w24,w7 add w25,w25,w8 - stp w22,w23,[x0,#2*4] + stp w22,w23,[PTR(0),#2*4] add w26,w26,w9 add w27,w27,w10 cmp x1,x2 - stp w24,w25,[x0,#4*4] - stp w26,w27,[x0,#6*4] + stp w24,w25,[PTR(0),#4*4] + stp w26,w27,[PTR(0),#6*4] b.ne .Loop - ldp x19,x20,[x29,#16] - add sp,sp,#4*4 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#4*4 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(16*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size sha256_block_data_order,.-sha256_block_data_order @@ -1071,121 +1076,121 @@ sha256_block_data_order: sha256_block_armv8: .Lv8_entry: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
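
Pointers that live across the compression loop (the context, the computed end-of-input, and the running input pointer) are spilled into PTR_WIDTH-sized slots and saved and reloaded with the PTR() forms rather than keeping the old 8-byte offsets; under purecap a capability round-tripped through an 8-byte integer slot would come back as a bare address, so widening the spill slots is what keeps the reloaded pointers usable. The sha256 rewind sequence above already shows the pattern; condensed:

        stp     PTR(0),PTR(2),[PTR(29),#(12*PTR_WIDTH)]   // spill ctx and end-of-input
        str     PTR(1),[PTR(29),#(14*PTR_WIDTH)]          // spill input pointer
        // ... 64 rounds ...
        ldp     PTR(0),PTR(2),[PTR(29),#(12*PTR_WIDTH)]   // reload as full pointers
        ldr     PTR(1),[PTR(29),#(14*PTR_WIDTH)]
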
+ add PTR(29),PTRN(sp),#0 - ld1 {v0.4s,v1.4s},[x0] - adr x3,.LK256 + ld1 {v0.4s,v1.4s},[PTR(0)] + adr PTR(3),.LK256 .Loop_hw: - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[PTR(1)],#64 sub x2,x2,#1 - ld1 {v16.4s},[x3],#16 + ld1 {v16.4s},[PTR(3)],#16 rev32 v4.16b,v4.16b rev32 v5.16b,v5.16b rev32 v6.16b,v6.16b rev32 v7.16b,v7.16b orr v18.16b,v0.16b,v0.16b // offload orr v19.16b,v1.16b,v1.16b - ld1 {v17.4s},[x3],#16 + ld1 {v17.4s},[PTR(3)],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 + ld1 {v16.4s},[PTR(3)],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 + ld1 {v17.4s},[PTR(3)],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 + ld1 {v16.4s},[PTR(3)],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 + ld1 {v17.4s},[PTR(3)],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 + ld1 {v16.4s},[PTR(3)],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 + ld1 {v17.4s},[PTR(3)],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 + ld1 {v16.4s},[PTR(3)],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 + ld1 {v17.4s},[PTR(3)],#16 add v16.4s,v16.4s,v4.4s .inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 + ld1 {v16.4s},[PTR(3)],#16 add v17.4s,v17.4s,v5.4s .inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 + ld1 {v17.4s},[PTR(3)],#16 add v16.4s,v16.4s,v6.4s .inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 
//sha256h2 v1.16b,v2.16b,v16.4s .inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 + ld1 {v16.4s},[PTR(3)],#16 add v17.4s,v17.4s,v7.4s .inst 0x5e282887 //sha256su0 v7.16b,v4.16b orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s .inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 + ld1 {v17.4s},[PTR(3)],#16 add v16.4s,v16.4s,v4.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - ld1 {v16.4s},[x3],#16 + ld1 {v16.4s},[PTR(3)],#16 add v17.4s,v17.4s,v5.4s orr v2.16b,v0.16b,v0.16b .inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s .inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - ld1 {v17.4s},[x3] + ld1 {v17.4s},[PTR(3)] add v16.4s,v16.4s,v6.4s - sub x3,x3,#64*4-16 // rewind + sub PTR(3),PTR(3),#64*4-16 // rewind orr v2.16b,v0.16b,v0.16b .inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s .inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s @@ -1200,9 +1205,9 @@ sha256_block_armv8: cbnz x2,.Loop_hw - st1 {v0.4s,v1.4s},[x0] + st1 {v0.4s,v1.4s},[PTR(0)] - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size sha256_block_armv8,.-sha256_block_armv8 #endif @@ -1215,39 +1220,44 @@ sha256_block_neon: AARCH64_VALID_CALL_TARGET .Lneon_entry: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later - stp x29, x30, [sp, #-16]! - mov x29, sp - sub sp,sp,#16*4 + stp PTR(29), PTR(30), [PTRN(sp), #-(2*PTR_WIDTH)]! + mov PTR(29), PTRN(sp) + sub PTRN(sp),PTRN(sp),#16*4 - adr x16,.LK256 - add x2,x1,x2,lsl#6 // len to point at the end of inp + adr PTR(16),.LK256 +#ifdef __CHERI_PURE_CAPABILITY__ + lsl x17,x2,#6 + add PTR(2),PTR(1),x17 // len to point at the end of inp +#else + add PTR(2),PTR(1),PTR(2),lsl#6 // len to point at the end of inp +#endif - ld1 {v0.16b},[x1], #16 - ld1 {v1.16b},[x1], #16 - ld1 {v2.16b},[x1], #16 - ld1 {v3.16b},[x1], #16 - ld1 {v4.4s},[x16], #16 - ld1 {v5.4s},[x16], #16 - ld1 {v6.4s},[x16], #16 - ld1 {v7.4s},[x16], #16 + ld1 {v0.16b},[PTR(1)], #16 + ld1 {v1.16b},[PTR(1)], #16 + ld1 {v2.16b},[PTR(1)], #16 + ld1 {v3.16b},[PTR(1)], #16 + ld1 {v4.4s},[PTR(16)], #16 + ld1 {v5.4s},[PTR(16)], #16 + ld1 {v6.4s},[PTR(16)], #16 + ld1 {v7.4s},[PTR(16)], #16 rev32 v0.16b,v0.16b // yes, even on rev32 v1.16b,v1.16b // big-endian rev32 v2.16b,v2.16b rev32 v3.16b,v3.16b - mov x17,sp + mov PTR(17),PTRN(sp) add v4.4s,v4.4s,v0.4s add v5.4s,v5.4s,v1.4s add v6.4s,v6.4s,v2.4s - st1 {v4.4s,v5.4s},[x17], #32 + st1 {v4.4s,v5.4s},[PTR(17)], #32 add v7.4s,v7.4s,v3.4s - st1 {v6.4s,v7.4s},[x17] - sub x17,x17,#32 + st1 {v6.4s,v7.4s},[PTR(17)] + sub PTR(17),PTR(17),#32 - ldp w3,w4,[x0] - ldp w5,w6,[x0,#8] - ldp w7,w8,[x0,#16] - ldp w9,w10,[x0,#24] - ldr w12,[sp,#0] + ldp w3,w4,[PTR(0)] + ldp w5,w6,[PTR(0),#8] + ldp w7,w8,[PTR(0),#16] + ldp w9,w10,[PTR(0),#24] + ldr w12,[PTRN(sp),#0] mov w13,wzr eor w14,w4,w5 mov w15,wzr @@ -1277,7 +1287,7 @@ sha256_block_neon: eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 - ldr w12,[sp,#4] + ldr w12,[PTRN(sp),#4] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 @@ -1305,7 +1315,7 @@ sha256_block_neon: eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 - ldr w12,[sp,#8] + ldr w12,[PTRN(sp),#8] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 @@ -1333,10 +1343,10 @@ sha256_block_neon: eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v0.4s,#13 - ldr w12,[sp,#12] + ldr w12,[PTRN(sp),#12] and w14,w14,w13 ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 
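
Only the spelling of the base register changes for the on-stack message schedule: the sixteen 4-byte scratch words that sha256_block_neon keeps below the frame are plain integers, so their #0..#60 byte offsets are untouched and sp simply becomes PTRN(sp); the round-constant loads likewise just take a PTR() base. The recurring pattern, copied from the hunks above rather than a new sequence:

        sub     PTRN(sp),PTRN(sp),#16*4         // integer scratch area, offsets unchanged
        str     w11,[PTRN(sp),#0]
        ldr     w12,[PTRN(sp),#4]
        ld1     {v4.4s},[PTR(16)],#16           // K256 table pointer is a PTR() register
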
+ ld1 {v4.4s},[PTR(16)], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 @@ -1358,12 +1368,12 @@ sha256_block_neon: eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 - ldr w12,[sp,#16] + ldr w12,[PTRN(sp),#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 - st1 {v4.4s},[x17], #16 + st1 {v4.4s},[PTR(17)], #16 ext v4.16b,v1.16b,v2.16b,#4 add w6,w6,w12 add w7,w7,w15 @@ -1386,7 +1396,7 @@ sha256_block_neon: eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 - ldr w12,[sp,#20] + ldr w12,[PTRN(sp),#20] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 @@ -1414,7 +1424,7 @@ sha256_block_neon: eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 - ldr w12,[sp,#24] + ldr w12,[PTRN(sp),#24] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 @@ -1442,10 +1452,10 @@ sha256_block_neon: eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v1.4s,#13 - ldr w12,[sp,#28] + ldr w12,[PTRN(sp),#28] and w14,w14,w13 ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 + ld1 {v4.4s},[PTR(16)], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 @@ -1467,12 +1477,12 @@ sha256_block_neon: eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 - ldr w12,[sp,#32] + ldr w12,[PTRN(sp),#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 - st1 {v4.4s},[x17], #16 + st1 {v4.4s},[PTR(17)], #16 ext v4.16b,v2.16b,v3.16b,#4 add w10,w10,w12 add w3,w3,w15 @@ -1495,7 +1505,7 @@ sha256_block_neon: eor w15,w15,w3,ror#20 ushr v7.4s,v4.4s,#18 add w10,w10,w11 - ldr w12,[sp,#36] + ldr w12,[PTRN(sp),#36] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 @@ -1523,7 +1533,7 @@ sha256_block_neon: eor w15,w15,w10,ror#20 sli v7.4s,v19.4s,#13 add w9,w9,w11 - ldr w12,[sp,#40] + ldr w12,[PTRN(sp),#40] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 @@ -1551,10 +1561,10 @@ sha256_block_neon: eor w15,w15,w9,ror#20 add w8,w8,w11 sli v17.4s,v2.4s,#13 - ldr w12,[sp,#44] + ldr w12,[PTRN(sp),#44] and w14,w14,w13 ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 + ld1 {v4.4s},[PTR(16)], #16 add w4,w4,w8 eor v19.16b,v19.16b,v17.16b eor w14,w14,w10 @@ -1576,12 +1586,12 @@ sha256_block_neon: eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 - ldr w12,[sp,#48] + ldr w12,[PTRN(sp),#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 - st1 {v4.4s},[x17], #16 + st1 {v4.4s},[PTR(17)], #16 ext v4.16b,v3.16b,v0.16b,#4 add w6,w6,w12 add w7,w7,w15 @@ -1604,7 +1614,7 @@ sha256_block_neon: eor w15,w15,w7,ror#20 ushr v7.4s,v4.4s,#18 add w6,w6,w11 - ldr w12,[sp,#52] + ldr w12,[PTRN(sp),#52] and w14,w14,w13 eor v5.16b,v5.16b,v6.16b ror w15,w15,#2 @@ -1632,7 +1642,7 @@ sha256_block_neon: eor w15,w15,w6,ror#20 sli v7.4s,v19.4s,#13 add w5,w5,w11 - ldr w12,[sp,#56] + ldr w12,[PTRN(sp),#56] and w13,w13,w14 eor v17.16b,v17.16b,v16.16b ror w15,w15,#2 @@ -1660,10 +1670,10 @@ sha256_block_neon: eor w15,w15,w5,ror#20 add w4,w4,w11 sli v17.4s,v3.4s,#13 - ldr w12,[sp,#60] + ldr w12,[PTRN(sp),#60] and w14,w14,w13 ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 + ld1 {v4.4s},[PTR(16)], #16 add w8,w8,w4 eor v19.16b,v19.16b,v17.16b eor w14,w14,w6 @@ -1685,30 +1695,38 @@ sha256_block_neon: eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 - ldr w12,[x16] + ldr w12,[PTR(16)] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 - st1 {v4.4s},[x17], #16 + st1 {v4.4s},[PTR(17)], #16 cmp w12,#0 // check for K256 terminator - ldr w12,[sp,#0] - sub x17,x17,#64 + ldr w12,[PTRN(sp),#0] + sub PTR(17),PTR(17),#64 bne .L_00_48 - sub x16,x16,#256 // rewind x16 + sub PTR(16),PTR(16),#256 // rewind PTR(16) cmp x1,x2 +#ifdef 
__CHERI_PURE_CAPABILITY__ + mov x17, #-64 +#else mov x17, #64 +#endif csel x17, x17, xzr, eq - sub x1,x1,x17 // avoid SEGV - mov x17,sp +#ifdef __CHERI_PURE_CAPABILITY__ + add PTR(1),PTR(1),x17 // avoid SEGV +#else + sub PTR(1),PTR(1),x17 // avoid SEGV +#endif + mov PTR(17),PTRN(sp) add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 - ld1 {v0.16b},[x1],#16 + ld1 {v0.16b},[PTR(1)],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 - ld1 {v4.4s},[x16],#16 + ld1 {v4.4s},[PTR(16)],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 @@ -1720,7 +1738,7 @@ sha256_block_neon: eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v0.4s add w10,w10,w11 - ldr w12,[sp,#4] + ldr w12,[PTRN(sp),#4] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 @@ -1739,7 +1757,7 @@ sha256_block_neon: eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 - ldr w12,[sp,#8] + ldr w12,[PTRN(sp),#8] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 @@ -1758,7 +1776,7 @@ sha256_block_neon: eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 - ldr w12,[sp,#12] + ldr w12,[PTRN(sp),#12] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 @@ -1777,19 +1795,19 @@ sha256_block_neon: eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 - ldr w12,[sp,#16] + ldr w12,[PTRN(sp),#16] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 - st1 {v4.4s},[x17], #16 + st1 {v4.4s},[PTR(17)], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 - ld1 {v1.16b},[x1],#16 + ld1 {v1.16b},[PTR(1)],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 - ld1 {v4.4s},[x16],#16 + ld1 {v4.4s},[PTR(16)],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 @@ -1801,7 +1819,7 @@ sha256_block_neon: eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v1.4s add w6,w6,w11 - ldr w12,[sp,#20] + ldr w12,[PTRN(sp),#20] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 @@ -1820,7 +1838,7 @@ sha256_block_neon: eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 - ldr w12,[sp,#24] + ldr w12,[PTRN(sp),#24] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 @@ -1839,7 +1857,7 @@ sha256_block_neon: eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 - ldr w12,[sp,#28] + ldr w12,[PTRN(sp),#28] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 @@ -1858,19 +1876,19 @@ sha256_block_neon: eor w14,w4,w5 eor w15,w15,w4,ror#20 add w3,w3,w11 - ldr w12,[sp,#32] + ldr w12,[PTRN(sp),#32] and w13,w13,w14 ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 - st1 {v4.4s},[x17], #16 + st1 {v4.4s},[PTR(17)], #16 add w10,w10,w12 add w3,w3,w15 and w12,w8,w7 - ld1 {v2.16b},[x1],#16 + ld1 {v2.16b},[PTR(1)],#16 bic w15,w9,w7 eor w11,w7,w7,ror#5 - ld1 {v4.4s},[x16],#16 + ld1 {v4.4s},[PTR(16)],#16 add w3,w3,w13 orr w12,w12,w15 eor w11,w11,w7,ror#19 @@ -1882,7 +1900,7 @@ sha256_block_neon: eor w15,w15,w3,ror#20 add v4.4s,v4.4s,v2.4s add w10,w10,w11 - ldr w12,[sp,#36] + ldr w12,[PTRN(sp),#36] and w14,w14,w13 ror w15,w15,#2 add w6,w6,w10 @@ -1901,7 +1919,7 @@ sha256_block_neon: eor w14,w10,w3 eor w15,w15,w10,ror#20 add w9,w9,w11 - ldr w12,[sp,#40] + ldr w12,[PTRN(sp),#40] and w13,w13,w14 ror w15,w15,#2 add w5,w5,w9 @@ -1920,7 +1938,7 @@ sha256_block_neon: eor w13,w9,w10 eor w15,w15,w9,ror#20 add w8,w8,w11 - ldr w12,[sp,#44] + ldr w12,[PTRN(sp),#44] and w14,w14,w13 ror w15,w15,#2 add w4,w4,w8 @@ -1939,19 +1957,19 @@ sha256_block_neon: eor w14,w8,w9 eor w15,w15,w8,ror#20 add w7,w7,w11 - ldr w12,[sp,#48] + ldr w12,[PTRN(sp),#48] and w13,w13,w14 ror w15,w15,#2 add w3,w3,w7 eor w13,w13,w9 - st1 {v4.4s},[x17], #16 + st1 {v4.4s},[PTR(17)], #16 add w6,w6,w12 add w7,w7,w15 and w12,w4,w3 - ld1 {v3.16b},[x1],#16 + ld1 {v3.16b},[PTR(1)],#16 bic w15,w5,w3 eor w11,w3,w3,ror#5 - ld1 {v4.4s},[x16],#16 + ld1 
{v4.4s},[PTR(16)],#16 add w7,w7,w13 orr w12,w12,w15 eor w11,w11,w3,ror#19 @@ -1963,7 +1981,7 @@ sha256_block_neon: eor w15,w15,w7,ror#20 add v4.4s,v4.4s,v3.4s add w6,w6,w11 - ldr w12,[sp,#52] + ldr w12,[PTRN(sp),#52] and w14,w14,w13 ror w15,w15,#2 add w10,w10,w6 @@ -1982,7 +2000,7 @@ sha256_block_neon: eor w14,w6,w7 eor w15,w15,w6,ror#20 add w5,w5,w11 - ldr w12,[sp,#56] + ldr w12,[PTRN(sp),#56] and w13,w13,w14 ror w15,w15,#2 add w9,w9,w5 @@ -2001,7 +2019,7 @@ sha256_block_neon: eor w13,w5,w6 eor w15,w15,w5,ror#20 add w4,w4,w11 - ldr w12,[sp,#60] + ldr w12,[PTRN(sp),#60] and w14,w14,w13 ror w15,w15,#2 add w8,w8,w4 @@ -2024,33 +2042,33 @@ sha256_block_neon: ror w15,w15,#2 add w7,w7,w3 eor w13,w13,w5 - st1 {v4.4s},[x17], #16 + st1 {v4.4s},[PTR(17)], #16 add w3,w3,w15 // h+=Sigma0(a) from the past - ldp w11,w12,[x0,#0] + ldp w11,w12,[PTR(0),#0] add w3,w3,w13 // h+=Maj(a,b,c) from the past - ldp w13,w14,[x0,#8] + ldp w13,w14,[PTR(0),#8] add w3,w3,w11 // accumulate add w4,w4,w12 - ldp w11,w12,[x0,#16] + ldp w11,w12,[PTR(0),#16] add w5,w5,w13 add w6,w6,w14 - ldp w13,w14,[x0,#24] + ldp w13,w14,[PTR(0),#24] add w7,w7,w11 add w8,w8,w12 - ldr w12,[sp,#0] - stp w3,w4,[x0,#0] + ldr w12,[PTRN(sp),#0] + stp w3,w4,[PTR(0),#0] add w9,w9,w13 mov w13,wzr - stp w5,w6,[x0,#8] + stp w5,w6,[PTR(0),#8] add w10,w10,w14 - stp w7,w8,[x0,#16] + stp w7,w8,[PTR(0),#16] eor w14,w4,w5 - stp w9,w10,[x0,#24] + stp w9,w10,[PTR(0),#24] mov w15,wzr - mov x17,sp + mov PTR(17),PTRN(sp) b.ne .L_00_48 - ldr x29,[x29] - add sp,sp,#16*4+16 + ldr PTR(29),[PTR(29)] + add PTRN(sp),PTRN(sp),#16*4+(2*PTR_WIDTH) ret .size sha256_block_neon,.-sha256_block_neon diff --git a/sys/crypto/openssl/aarch64/sha512-armv8.S b/sys/crypto/openssl/aarch64/sha512-armv8.S index c119d9cf5c95..9828ef4806fc 100644 --- a/sys/crypto/openssl/aarch64/sha512-armv8.S +++ b/sys/crypto/openssl/aarch64/sha512-armv8.S @@ -70,35 +70,40 @@ sha512_block_data_order: AARCH64_VALID_CALL_TARGET #ifndef __KERNEL__ - adrp x16,OPENSSL_armcap_P - ldr w16,[x16,#:lo12:OPENSSL_armcap_P] + adrp PTR(16),OPENSSL_armcap_P + ldr w16,[PTR(16),#:lo12:OPENSSL_armcap_P] tst w16,#ARMV8_SHA512 b.ne .Lv8_entry #endif AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(16*PTR_WIDTH)]! 
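
Computing the end-of-input pointer is the one spot that needs a new #ifdef in the generated assembly: the original code folded the block count into the input pointer with a shifted-register add, a form that does not carry over directly to a capability base, so the purecap build scales the count in a scratch x register first and then adds it to the capability. The sha256 version (64-byte blocks) is quoted below; sha512 does the same with lsl #7, and the NEON path's "avoid SEGV" step likewise negates its #64 adjustment so it can be applied with an add instead of a sub on the capability:

#ifdef __CHERI_PURE_CAPABILITY__
        lsl     x17,x2,#6
        add     PTR(2),PTR(1),x17               // end of input
#else
        add     PTR(2),PTR(1),PTR(2),lsl#6      // end of input
#endif
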
+ add PTR(29),PTRN(sp),#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*8 + stp PTR(19),PTR(20),[PTRN(sp),#(2*PTR_WIDTH)] + stp PTR(21),PTR(22),[PTRN(sp),#(4*PTR_WIDTH)] + stp PTR(23),PTR(24),[PTRN(sp),#(6*PTR_WIDTH)] + stp PTR(25),PTR(26),[PTRN(sp),#(8*PTR_WIDTH)] + stp PTR(27),PTR(28),[PTRN(sp),#(10*PTR_WIDTH)] + sub PTRN(sp),PTRN(sp),#4*8 - ldp x20,x21,[x0] // load context - ldp x22,x23,[x0,#2*8] - ldp x24,x25,[x0,#4*8] - add x2,x1,x2,lsl#7 // end of input - ldp x26,x27,[x0,#6*8] - adr x30,.LK512 - stp x0,x2,[x29,#96] + ldp x20,x21,[PTR(0)] // load context + ldp x22,x23,[PTR(0),#2*8] + ldp x24,x25,[PTR(0),#4*8] +#ifdef __CHERI_PURE_CAPABILITY__ + lsl x17,x2,#7 + add PTR(2),PTR(1),x17 // end of input +#else + add PTR(2),PTR(1),PTR(2),lsl#7 // end of input +#endif + ldp x26,x27,[PTR(0),#6*8] + adr PTR(30),.LK512 + stp PTR(0),PTR(2),[PTR(29),#(12*PTR_WIDTH)] .Loop: - ldp x3,x4,[x1],#2*8 - ldr x19,[x30],#8 // *K++ + ldp x3,x4,[PTR(1)],#2*8 + ldr x19,[PTR(30)],#8 // *K++ eor x28,x21,x22 // magic seed - str x1,[x29,#112] + str PTR(1),[PTR(29),#(14*PTR_WIDTH)] #ifndef __AARCH64EB__ rev x3,x3 // 0 #endif @@ -120,12 +125,12 @@ sha512_block_data_order: eor x28,x28,x21 // Maj(a,b,c) eor x17,x6,x17,ror#34 // Sigma0(a) add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round //add x27,x27,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x4,x4 // 1 #endif - ldp x5,x6,[x1],#2*8 + ldp x5,x6,[PTR(1)],#2*8 add x27,x27,x17 // h+=Sigma0(a) ror x16,x23,#14 add x26,x26,x28 // h+=K[i] @@ -145,7 +150,7 @@ sha512_block_data_order: eor x19,x19,x20 // Maj(a,b,c) eor x17,x7,x17,ror#34 // Sigma0(a) add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round //add x26,x26,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x5,x5 // 2 @@ -169,12 +174,12 @@ sha512_block_data_order: eor x28,x28,x27 // Maj(a,b,c) eor x17,x8,x17,ror#34 // Sigma0(a) add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round //add x25,x25,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x6,x6 // 3 #endif - ldp x7,x8,[x1],#2*8 + ldp x7,x8,[PTR(1)],#2*8 add x25,x25,x17 // h+=Sigma0(a) ror x16,x21,#14 add x24,x24,x28 // h+=K[i] @@ -194,7 +199,7 @@ sha512_block_data_order: eor x19,x19,x26 // Maj(a,b,c) eor x17,x9,x17,ror#34 // Sigma0(a) add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round //add x24,x24,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x7,x7 // 4 @@ -218,12 +223,12 @@ sha512_block_data_order: eor x28,x28,x25 // Maj(a,b,c) eor x17,x10,x17,ror#34 // Sigma0(a) add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round //add x23,x23,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x8,x8 // 5 #endif - ldp x9,x10,[x1],#2*8 + ldp x9,x10,[PTR(1)],#2*8 add x23,x23,x17 // h+=Sigma0(a) ror x16,x27,#14 add x22,x22,x28 // h+=K[i] @@ -243,7 +248,7 @@ sha512_block_data_order: eor x19,x19,x24 // Maj(a,b,c) eor x17,x11,x17,ror#34 // Sigma0(a) add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round //add x22,x22,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x9,x9 // 6 @@ -267,12 +272,12 @@ sha512_block_data_order: eor x28,x28,x23 // Maj(a,b,c) eor 
x17,x12,x17,ror#34 // Sigma0(a) add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round //add x21,x21,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x10,x10 // 7 #endif - ldp x11,x12,[x1],#2*8 + ldp x11,x12,[PTR(1)],#2*8 add x21,x21,x17 // h+=Sigma0(a) ror x16,x25,#14 add x20,x20,x28 // h+=K[i] @@ -292,7 +297,7 @@ sha512_block_data_order: eor x19,x19,x22 // Maj(a,b,c) eor x17,x13,x17,ror#34 // Sigma0(a) add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round //add x20,x20,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x11,x11 // 8 @@ -316,12 +321,12 @@ sha512_block_data_order: eor x28,x28,x21 // Maj(a,b,c) eor x17,x14,x17,ror#34 // Sigma0(a) add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round //add x27,x27,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x12,x12 // 9 #endif - ldp x13,x14,[x1],#2*8 + ldp x13,x14,[PTR(1)],#2*8 add x27,x27,x17 // h+=Sigma0(a) ror x16,x23,#14 add x26,x26,x28 // h+=K[i] @@ -341,7 +346,7 @@ sha512_block_data_order: eor x19,x19,x20 // Maj(a,b,c) eor x17,x15,x17,ror#34 // Sigma0(a) add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round //add x26,x26,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x13,x13 // 10 @@ -365,14 +370,14 @@ sha512_block_data_order: eor x28,x28,x27 // Maj(a,b,c) eor x17,x0,x17,ror#34 // Sigma0(a) add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round //add x25,x25,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x14,x14 // 11 #endif - ldp x15,x0,[x1],#2*8 + ldp x15,x0,[PTR(1)],#2*8 add x25,x25,x17 // h+=Sigma0(a) - str x6,[sp,#24] + str x6,[PTRN(sp),#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] eor x6,x21,x21,ror#23 @@ -391,13 +396,13 @@ sha512_block_data_order: eor x19,x19,x26 // Maj(a,b,c) eor x17,x6,x17,ror#34 // Sigma0(a) add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round //add x24,x24,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x15,x15 // 12 #endif add x24,x24,x17 // h+=Sigma0(a) - str x7,[sp,#0] + str x7,[PTRN(sp),#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] eor x7,x20,x20,ror#23 @@ -416,14 +421,14 @@ sha512_block_data_order: eor x28,x28,x25 // Maj(a,b,c) eor x17,x7,x17,ror#34 // Sigma0(a) add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round //add x23,x23,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x0,x0 // 13 #endif - ldp x1,x2,[x1] + ldp x1,x2,[PTR(1)] add x23,x23,x17 // h+=Sigma0(a) - str x8,[sp,#8] + str x8,[PTRN(sp),#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] eor x8,x27,x27,ror#23 @@ -442,14 +447,14 @@ sha512_block_data_order: eor x19,x19,x24 // Maj(a,b,c) eor x17,x8,x17,ror#34 // Sigma0(a) add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round //add x22,x22,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x1,x1 // 14 #endif - ldr x6,[sp,#24] + ldr x6,[PTRN(sp),#24] add x22,x22,x17 // h+=Sigma0(a) - str x9,[sp,#16] + str x9,[PTRN(sp),#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] eor x9,x26,x26,ror#23 @@ -468,14 +473,14 @@ sha512_block_data_order: eor x28,x28,x23 // Maj(a,b,c) eor x17,x9,x17,ror#34 // Sigma0(a) add x21,x21,x28 // h+=Maj(a,b,c) 
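
Inside the rounds themselves the only memory traffic is the post-indexed loads of the message block through argument 1 and of the round constants through the table pointer parked in register 30, so turning those two bases into PTR() registers leaves the bulk of the w/x rotate-and-add code byte-for-byte identical; the real return pointer is reloaded from the frame by the epilogue, so reusing register 30 for the K table stays safe in both builds. The per-round access pattern, with illustrative expansion comments:

        ldp     x13,x14,[PTR(1)],#2*8           // next two message words
        ldr     x19,[PTR(30)],#8                // *K++
        // purecap build:        ldp x13,x14,[c1],#16   ;  ldr x19,[c30],#8
        // plain AArch64 build:  ldp x13,x14,[x1],#16   ;  ldr x19,[x30],#8
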
- ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round //add x21,x21,x17 // h+=Sigma0(a) #ifndef __AARCH64EB__ rev x2,x2 // 15 #endif - ldr x7,[sp,#0] + ldr x7,[PTRN(sp),#0] add x21,x21,x17 // h+=Sigma0(a) - str x10,[sp,#24] + str x10,[PTRN(sp),#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x9,x4,#1 @@ -501,13 +506,13 @@ sha512_block_data_order: add x3,x3,x12 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round add x3,x3,x9 add x20,x20,x17 // h+=Sigma0(a) add x3,x3,x8 .Loop_16_xx: - ldr x8,[sp,#8] - str x11,[sp,#0] + ldr x8,[PTRN(sp),#8] + str x11,[PTRN(sp),#0] ror x16,x24,#14 add x27,x27,x19 // h+=K[i] ror x10,x5,#1 @@ -533,12 +538,12 @@ sha512_block_data_order: add x4,x4,x13 add x23,x23,x27 // d+=h add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round add x4,x4,x10 add x27,x27,x17 // h+=Sigma0(a) add x4,x4,x9 - ldr x9,[sp,#16] - str x12,[sp,#8] + ldr x9,[PTRN(sp),#16] + str x12,[PTRN(sp),#8] ror x16,x23,#14 add x26,x26,x28 // h+=K[i] ror x11,x6,#1 @@ -564,12 +569,12 @@ sha512_block_data_order: add x5,x5,x14 add x22,x22,x26 // d+=h add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round add x5,x5,x11 add x26,x26,x17 // h+=Sigma0(a) add x5,x5,x10 - ldr x10,[sp,#24] - str x13,[sp,#16] + ldr x10,[PTRN(sp),#24] + str x13,[PTRN(sp),#16] ror x16,x22,#14 add x25,x25,x19 // h+=K[i] ror x12,x7,#1 @@ -595,12 +600,12 @@ sha512_block_data_order: add x6,x6,x15 add x21,x21,x25 // d+=h add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round add x6,x6,x12 add x25,x25,x17 // h+=Sigma0(a) add x6,x6,x11 - ldr x11,[sp,#0] - str x14,[sp,#24] + ldr x11,[PTRN(sp),#0] + str x14,[PTRN(sp),#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] ror x13,x8,#1 @@ -626,12 +631,12 @@ sha512_block_data_order: add x7,x7,x0 add x20,x20,x24 // d+=h add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round add x7,x7,x13 add x24,x24,x17 // h+=Sigma0(a) add x7,x7,x12 - ldr x12,[sp,#8] - str x15,[sp,#0] + ldr x12,[PTRN(sp),#8] + str x15,[PTRN(sp),#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] ror x14,x9,#1 @@ -657,12 +662,12 @@ sha512_block_data_order: add x8,x8,x1 add x27,x27,x23 // d+=h add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round add x8,x8,x14 add x23,x23,x17 // h+=Sigma0(a) add x8,x8,x13 - ldr x13,[sp,#16] - str x0,[sp,#8] + ldr x13,[PTRN(sp),#16] + str x0,[PTRN(sp),#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] ror x15,x10,#1 @@ -688,12 +693,12 @@ sha512_block_data_order: add x9,x9,x2 add x26,x26,x22 // d+=h add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round add x9,x9,x15 add x22,x22,x17 // h+=Sigma0(a) add x9,x9,x14 - ldr x14,[sp,#24] - str x1,[sp,#16] + ldr x14,[PTRN(sp),#24] + str x1,[PTRN(sp),#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] ror x0,x11,#1 @@ -719,12 +724,12 @@ sha512_block_data_order: add x10,x10,x3 add x25,x25,x21 // d+=h add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round add x10,x10,x0 add x21,x21,x17 // h+=Sigma0(a) add x10,x10,x15 - ldr 
x15,[sp,#0] - str x2,[sp,#24] + ldr x15,[PTRN(sp),#0] + str x2,[PTRN(sp),#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x1,x12,#1 @@ -750,12 +755,12 @@ sha512_block_data_order: add x11,x11,x4 add x24,x24,x20 // d+=h add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round add x11,x11,x1 add x20,x20,x17 // h+=Sigma0(a) add x11,x11,x0 - ldr x0,[sp,#8] - str x3,[sp,#0] + ldr x0,[PTRN(sp),#8] + str x3,[PTRN(sp),#0] ror x16,x24,#14 add x27,x27,x19 // h+=K[i] ror x2,x13,#1 @@ -781,12 +786,12 @@ sha512_block_data_order: add x12,x12,x5 add x23,x23,x27 // d+=h add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round add x12,x12,x2 add x27,x27,x17 // h+=Sigma0(a) add x12,x12,x1 - ldr x1,[sp,#16] - str x4,[sp,#8] + ldr x1,[PTRN(sp),#16] + str x4,[PTRN(sp),#8] ror x16,x23,#14 add x26,x26,x28 // h+=K[i] ror x3,x14,#1 @@ -812,12 +817,12 @@ sha512_block_data_order: add x13,x13,x6 add x22,x22,x26 // d+=h add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round add x13,x13,x3 add x26,x26,x17 // h+=Sigma0(a) add x13,x13,x2 - ldr x2,[sp,#24] - str x5,[sp,#16] + ldr x2,[PTRN(sp),#24] + str x5,[PTRN(sp),#16] ror x16,x22,#14 add x25,x25,x19 // h+=K[i] ror x4,x15,#1 @@ -843,12 +848,12 @@ sha512_block_data_order: add x14,x14,x7 add x21,x21,x25 // d+=h add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round add x14,x14,x4 add x25,x25,x17 // h+=Sigma0(a) add x14,x14,x3 - ldr x3,[sp,#0] - str x6,[sp,#24] + ldr x3,[PTRN(sp),#0] + str x6,[PTRN(sp),#24] ror x16,x21,#14 add x24,x24,x28 // h+=K[i] ror x5,x0,#1 @@ -874,12 +879,12 @@ sha512_block_data_order: add x15,x15,x8 add x20,x20,x24 // d+=h add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round add x15,x15,x5 add x24,x24,x17 // h+=Sigma0(a) add x15,x15,x4 - ldr x4,[sp,#8] - str x7,[sp,#0] + ldr x4,[PTRN(sp),#8] + str x7,[PTRN(sp),#0] ror x16,x20,#14 add x23,x23,x19 // h+=K[i] ror x6,x1,#1 @@ -905,12 +910,12 @@ sha512_block_data_order: add x0,x0,x9 add x27,x27,x23 // d+=h add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round add x0,x0,x6 add x23,x23,x17 // h+=Sigma0(a) add x0,x0,x5 - ldr x5,[sp,#16] - str x8,[sp,#8] + ldr x5,[PTRN(sp),#16] + str x8,[PTRN(sp),#8] ror x16,x27,#14 add x22,x22,x28 // h+=K[i] ror x7,x2,#1 @@ -936,12 +941,12 @@ sha512_block_data_order: add x1,x1,x10 add x26,x26,x22 // d+=h add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round add x1,x1,x7 add x22,x22,x17 // h+=Sigma0(a) add x1,x1,x6 - ldr x6,[sp,#24] - str x9,[sp,#16] + ldr x6,[PTRN(sp),#24] + str x9,[PTRN(sp),#16] ror x16,x26,#14 add x21,x21,x19 // h+=K[i] ror x8,x3,#1 @@ -967,12 +972,12 @@ sha512_block_data_order: add x2,x2,x11 add x25,x25,x21 // d+=h add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round + ldr x28,[PTR(30)],#8 // *K++, x19 in next round add x2,x2,x8 add x21,x21,x17 // h+=Sigma0(a) add x2,x2,x7 - ldr x7,[sp,#0] - str x10,[sp,#24] + ldr x7,[PTRN(sp),#0] + str x10,[PTRN(sp),#24] ror x16,x25,#14 add x20,x20,x28 // h+=K[i] ror x9,x4,#1 @@ -998,43 +1003,43 @@ sha512_block_data_order: add x3,x3,x12 add x24,x24,x20 // d+=h add x20,x20,x19 // 
h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round + ldr x19,[PTR(30)],#8 // *K++, x28 in next round add x3,x3,x9 add x20,x20,x17 // h+=Sigma0(a) add x3,x3,x8 cbnz x19,.Loop_16_xx - ldp x0,x2,[x29,#96] - ldr x1,[x29,#112] - sub x30,x30,#648 // rewind + ldp PTR(0),PTR(2),[PTR(29),#(12*PTR_WIDTH)] + ldr PTR(1),[PTR(29),#(14*PTR_WIDTH)] + sub PTR(30),PTR(30),#648 // rewind - ldp x3,x4,[x0] - ldp x5,x6,[x0,#2*8] - add x1,x1,#14*8 // advance input pointer - ldp x7,x8,[x0,#4*8] + ldp x3,x4,[PTR(0)] + ldp x5,x6,[PTR(0),#2*8] + add PTR(1),PTR(1),#14*8 // advance input pointer + ldp x7,x8,[PTR(0),#4*8] add x20,x20,x3 - ldp x9,x10,[x0,#6*8] + ldp x9,x10,[PTR(0),#6*8] add x21,x21,x4 add x22,x22,x5 add x23,x23,x6 - stp x20,x21,[x0] + stp x20,x21,[PTR(0)] add x24,x24,x7 add x25,x25,x8 - stp x22,x23,[x0,#2*8] + stp x22,x23,[PTR(0),#2*8] add x26,x26,x9 add x27,x27,x10 cmp x1,x2 - stp x24,x25,[x0,#4*8] - stp x26,x27,[x0,#6*8] + stp x24,x25,[PTR(0),#4*8] + stp x26,x27,[PTR(0),#6*8] b.ne .Loop - ldp x19,x20,[x29,#16] - add sp,sp,#4*8 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 + ldp PTR(19),PTR(20),[PTR(29),#(2*PTR_WIDTH)] + add PTRN(sp),PTRN(sp),#4*8 + ldp PTR(21),PTR(22),[PTR(29),#(4*PTR_WIDTH)] + ldp PTR(23),PTR(24),[PTR(29),#(6*PTR_WIDTH)] + ldp PTR(25),PTR(26),[PTR(29),#(8*PTR_WIDTH)] + ldp PTR(27),PTR(28),[PTR(29),#(10*PTR_WIDTH)] + ldp PTR(29),PTR(30),[PTRN(sp)],#(16*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size sha512_block_data_order,.-sha512_block_data_order @@ -1093,14 +1098,14 @@ sha512_block_data_order: sha512_block_armv8: .Lv8_entry: // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 - ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[PTR(1)],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[PTR(1)],#64 - ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context - adr x3,.LK512 + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[PTR(0)] // load context + adr PTR(3),.LK512 rev64 v16.16b,v16.16b rev64 v17.16b,v17.16b @@ -1114,16 +1119,16 @@ sha512_block_armv8: .align 4 .Loop_hw: - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 subs x2,x2,#1 - sub x4,x1,#128 + sub PTR(4),PTR(1),#128 orr v26.16b,v0.16b,v0.16b // offload orr v27.16b,v1.16b,v1.16b orr v28.16b,v2.16b,v2.16b orr v29.16b,v3.16b,v3.16b - csel x1,x1,x4,ne // conditional rewind + csel PTR(1),PTR(1),PTR(4),ne // conditional rewind add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 @@ -1135,7 +1140,7 @@ sha512_block_armv8: add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 @@ -1147,7 +1152,7 @@ sha512_block_armv8: add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 @@ -1159,7 +1164,7 @@ sha512_block_armv8: add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 @@ -1171,7 +1176,7 @@ sha512_block_armv8: add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 @@ -1183,7 +1188,7 @@ sha512_block_armv8: add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 @@ -1195,7 +1200,7 @@ sha512_block_armv8: add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 @@ -1207,7 +1212,7 @@ sha512_block_armv8: add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 @@ -1219,7 +1224,7 @@ sha512_block_armv8: add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 @@ -1231,7 +1236,7 @@ sha512_block_armv8: add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext 
v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 @@ -1243,7 +1248,7 @@ sha512_block_armv8: add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 @@ -1255,7 +1260,7 @@ sha512_block_armv8: add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 @@ -1267,7 +1272,7 @@ sha512_block_armv8: add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 @@ -1279,7 +1284,7 @@ sha512_block_armv8: add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 @@ -1291,7 +1296,7 @@ sha512_block_armv8: add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 @@ -1303,7 +1308,7 @@ sha512_block_armv8: add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 @@ -1315,7 +1320,7 @@ sha512_block_armv8: add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 @@ -1327,7 +1332,7 @@ sha512_block_armv8: add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 @@ -1339,7 +1344,7 @@ sha512_block_armv8: add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 @@ -1351,7 +1356,7 @@ sha512_block_armv8: add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 @@ -1363,7 +1368,7 @@ sha512_block_armv8: add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 @@ -1375,7 +1380,7 @@ sha512_block_armv8: add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 @@ -1387,7 +1392,7 @@ 
sha512_block_armv8: add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 @@ -1399,7 +1404,7 @@ sha512_block_armv8: add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 @@ -1411,7 +1416,7 @@ sha512_block_armv8: add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 @@ -1423,7 +1428,7 @@ sha512_block_armv8: add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 @@ -1435,7 +1440,7 @@ sha512_block_armv8: add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 @@ -1447,7 +1452,7 @@ sha512_block_armv8: add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 @@ -1459,7 +1464,7 @@ sha512_block_armv8: add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 @@ -1471,7 +1476,7 @@ sha512_block_armv8: add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 @@ -1483,7 +1488,7 @@ sha512_block_armv8: add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 @@ -1495,7 +1500,7 @@ sha512_block_armv8: add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 @@ -1506,9 +1511,9 @@ sha512_block_armv8: .inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 add v24.2d,v24.2d,v16.2d - ld1 {v16.16b},[x1],#16 // load next input + ld1 {v16.16b},[PTR(1)],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 @@ -1517,9 +1522,9 @@ sha512_block_armv8: rev64 v16.16b,v16.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 add v25.2d,v25.2d,v17.2d - ld1 {v17.16b},[x1],#16 // 
load next input + ld1 {v17.16b},[PTR(1)],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 @@ -1528,9 +1533,9 @@ sha512_block_armv8: rev64 v17.16b,v17.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 add v24.2d,v24.2d,v18.2d - ld1 {v18.16b},[x1],#16 // load next input + ld1 {v18.16b},[PTR(1)],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 @@ -1539,9 +1544,9 @@ sha512_block_armv8: rev64 v18.16b,v18.16b add v2.2d,v4.2d,v0.2d // "D + T1" .inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 add v25.2d,v25.2d,v19.2d - ld1 {v19.16b},[x1],#16 // load next input + ld1 {v19.16b},[PTR(1)],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v2.16b,v3.16b,#8 ext v6.16b,v1.16b,v2.16b,#8 @@ -1550,9 +1555,9 @@ sha512_block_armv8: rev64 v19.16b,v19.16b add v4.2d,v1.2d,v3.2d // "D + T1" .inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 add v24.2d,v24.2d,v20.2d - ld1 {v20.16b},[x1],#16 // load next input + ld1 {v20.16b},[PTR(1)],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v4.16b,v2.16b,#8 ext v6.16b,v0.16b,v4.16b,#8 @@ -1561,9 +1566,9 @@ sha512_block_armv8: rev64 v20.16b,v20.16b add v1.2d,v0.2d,v2.2d // "D + T1" .inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - ld1 {v24.2d},[x3],#16 + ld1 {v24.2d},[PTR(3)],#16 add v25.2d,v25.2d,v21.2d - ld1 {v21.16b},[x1],#16 // load next input + ld1 {v21.16b},[PTR(1)],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v1.16b,v4.16b,#8 ext v6.16b,v3.16b,v1.16b,#8 @@ -1572,9 +1577,9 @@ sha512_block_armv8: rev64 v21.16b,v21.16b add v0.2d,v3.2d,v4.2d // "D + T1" .inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - ld1 {v25.2d},[x3],#16 + ld1 {v25.2d},[PTR(3)],#16 add v24.2d,v24.2d,v22.2d - ld1 {v22.16b},[x1],#16 // load next input + ld1 {v22.16b},[PTR(1)],#16 // load next input ext v24.16b,v24.16b,v24.16b,#8 ext v5.16b,v0.16b,v1.16b,#8 ext v6.16b,v2.16b,v0.16b,#8 @@ -1583,9 +1588,9 @@ sha512_block_armv8: rev64 v22.16b,v22.16b add v3.2d,v2.2d,v1.2d // "D + T1" .inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - sub x3,x3,#80*8 // rewind + sub PTR(3),PTR(3),#80*8 // rewind add v25.2d,v25.2d,v23.2d - ld1 {v23.16b},[x1],#16 // load next input + ld1 {v23.16b},[PTR(1)],#16 // load next input ext v25.16b,v25.16b,v25.16b,#8 ext v5.16b,v3.16b,v0.16b,#8 ext v6.16b,v4.16b,v3.16b,#8 @@ -1601,9 +1606,9 @@ sha512_block_armv8: cbnz x2,.Loop_hw - st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[PTR(0)] // store context - ldr x29,[sp],#16 + ldr PTR(29),[PTRN(sp)],#(2*PTR_WIDTH) ret .size sha512_block_armv8,.-sha512_block_armv8 #endif diff --git a/sys/crypto/openssl/aarch64/vpaes-armv8.S b/sys/crypto/openssl/aarch64/vpaes-armv8.S index c6338b00d5f6..34491bd0e80a 100644 --- a/sys/crypto/openssl/aarch64/vpaes-armv8.S +++ b/sys/crypto/openssl/aarch64/vpaes-armv8.S @@ -102,11 +102,11 @@ _vpaes_consts: .type _vpaes_encrypt_preheat,%function .align 4 _vpaes_encrypt_preheat: - adr x10, .Lk_inv + adr PTR(10), .Lk_inv movi v17.16b, #0x0f - ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv - ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo - ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ld1 {v18.2d,v19.2d}, [PTR(10)],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [PTR(10)],#64 // .Lk_ipt, 
.Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [PTR(10)] // .Lk_sb1, .Lk_sb2 ret .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat @@ -128,11 +128,11 @@ _vpaes_encrypt_preheat: .type _vpaes_encrypt_core,%function .align 4 _vpaes_encrypt_core: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - adr x11, .Lk_mc_forward+16 + mov PTR(9), PTR(2) + ldr w8, [PTR(2),#240] // pull rounds + adr PTR(11), .Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + ld1 {v16.2d}, [PTR(9)], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 @@ -145,22 +145,26 @@ _vpaes_encrypt_core: .align 4 .Lenc_loop: // middle of middle round - add x10, x11, #0x40 + add PTR(10), PTR(11), #0x40 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + ld1 {v1.2d}, [PTR(11)], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t - ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + ld1 {v4.2d}, [PTR(10)] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D +#ifdef __CHERI_PURE_CAPABILITY__ + alignd c11, c11, #6 // and $0x30, %r11 # ... mod 4 +#else and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 +#endif eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D sub w8, w8, #1 // nr-- @@ -178,15 +182,15 @@ _vpaes_encrypt_core: tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_loop // middle of last round - add x10, x11, #0x80 + add PTR(10), PTR(11), #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + ld1 {v1.2d}, [PTR(10)] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A @@ -199,15 +203,15 @@ _vpaes_encrypt_core: .align 4 vpaes_encrypt: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! 
+ add PTR(29),PTRN(sp),#0 - ld1 {v7.16b}, [x0] + ld1 {v7.16b}, [PTR(0)] bl _vpaes_encrypt_preheat bl _vpaes_encrypt_core - st1 {v0.16b}, [x1] + st1 {v0.16b}, [PTR(1)] - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_encrypt,.-vpaes_encrypt @@ -215,11 +219,11 @@ vpaes_encrypt: .type _vpaes_encrypt_2x,%function .align 4 _vpaes_encrypt_2x: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - adr x11, .Lk_mc_forward+16 + mov PTR(9), PTR(2) + ldr w8, [PTR(2),#240] // pull rounds + adr PTR(11), .Lk_mc_forward+16 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + ld1 {v16.2d}, [PTR(9)], #16 // vmovdqu (%r9), %xmm5 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b @@ -238,10 +242,10 @@ _vpaes_encrypt_2x: .align 4 .Lenc_2x_loop: // middle of middle round - add x10, x11, #0x40 + add PTR(10), PTR(11), #0x40 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u tbl v12.16b, {v25.16b}, v10.16b - ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + ld1 {v1.2d}, [PTR(11)], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t tbl v8.16b, {v24.16b}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k @@ -252,7 +256,7 @@ _vpaes_encrypt_2x: eor v8.16b, v8.16b, v12.16b tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t tbl v10.16b, {v26.16b}, v11.16b - ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + ld1 {v4.2d}, [PTR(10)] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B tbl v11.16b, {v8.16b}, v1.16b eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A @@ -265,7 +269,11 @@ _vpaes_encrypt_2x: tbl v12.16b, {v11.16b},v1.16b eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D eor v8.16b, v8.16b, v11.16b +#ifdef __CHERI_PURE_CAPABILITY__ + alignd c11, c11, #6 // and $0x30, %r11 # ... mod 4 +#else and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 +#endif eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D eor v8.16b, v8.16b, v12.16b sub w8, w8, #1 // nr-- @@ -296,16 +304,16 @@ _vpaes_encrypt_2x: eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm5 cbnz w8, .Lenc_2x_loop // middle of last round - add x10, x11, #0x80 + add PTR(10), PTR(11), #0x80 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou tbl v12.16b, {v22.16b}, v10.16b - ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + ld1 {v1.2d}, [PTR(10)] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t tbl v8.16b, {v23.16b}, v11.16b eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k @@ -320,13 +328,13 @@ _vpaes_encrypt_2x: .type _vpaes_decrypt_preheat,%function .align 4 _vpaes_decrypt_preheat: - adr x10, .Lk_inv + adr PTR(10), .Lk_inv movi v17.16b, #0x0f - adr x11, .Lk_dipt - ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv - ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo - ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd - ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + adr PTR(11), .Lk_dipt + ld1 {v18.2d,v19.2d}, [PTR(10)],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [PTR(11)],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [PTR(11)],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [PTR(11)] // .Lk_dsbb, .Lk_dsbe ret .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat @@ -338,22 +346,22 @@ _vpaes_decrypt_preheat: .type _vpaes_decrypt_core,%function .align 4 _vpaes_decrypt_core: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds + mov PTR(9), PTR(2) + ldr w8, [PTR(2),#240] // pull rounds // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 eor x11, x11, #0x30 // xor $0x30, %r11 - adr x10, .Lk_sr + adr PTR(10), .Lk_sr and x11, x11, #0x30 // and $0x30, %r11 - add x11, x11, x10 - adr x10, .Lk_mc_forward+48 + add PTR(11), PTR(10), x11 + adr PTR(10), .Lk_mc_forward+48 - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm4 # round0 key and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 - ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + ld1 {v5.2d}, [PTR(10)] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 @@ -412,14 +420,14 @@ _vpaes_decrypt_core: tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm0 cbnz w8, .Ldec_loop // middle of last round // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + 
ld1 {v2.2d}, [PTR(11)] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A @@ -432,15 +440,15 @@ _vpaes_decrypt_core: .align 4 vpaes_decrypt: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 - ld1 {v7.16b}, [x0] + ld1 {v7.16b}, [PTR(0)] bl _vpaes_decrypt_preheat bl _vpaes_decrypt_core - st1 {v0.16b}, [x1] + st1 {v0.16b}, [PTR(1)] - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_decrypt,.-vpaes_decrypt @@ -449,25 +457,25 @@ vpaes_decrypt: .type _vpaes_decrypt_2x,%function .align 4 _vpaes_decrypt_2x: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds + mov PTR(9), PTR(2) + ldr w8, [PTR(2),#240] // pull rounds // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 eor x11, x11, #0x30 // xor $0x30, %r11 - adr x10, .Lk_sr + adr PTR(10), .Lk_sr and x11, x11, #0x30 // and $0x30, %r11 - add x11, x11, x10 - adr x10, .Lk_mc_forward+48 + add PTR(11), PTR(10), x11 + adr PTR(10), .Lk_mc_forward+48 - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm4 # round0 key and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 and v9.16b, v15.16b, v17.16b ushr v8.16b, v15.16b, #4 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 tbl v10.16b, {v20.16b},v9.16b - ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + ld1 {v5.2d}, [PTR(10)] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 tbl v8.16b, {v21.16b},v8.16b @@ -560,7 +568,7 @@ _vpaes_decrypt_2x: eor v10.16b, v10.16b, v9.16b eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo eor v11.16b, v11.16b, v8.16b - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + ld1 {v16.2d}, [PTR(9)],#16 // vmovdqu (%r9), %xmm0 cbnz w8, .Ldec_2x_loop // middle of last round @@ -570,7 +578,7 @@ _vpaes_decrypt_2x: // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t tbl v9.16b, {v23.16b}, v11.16b - ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + ld1 {v2.2d}, [PTR(11)] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k eor v12.16b, v12.16b, v16.16b eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A @@ -587,18 +595,18 @@ _vpaes_decrypt_2x: .type _vpaes_key_preheat,%function .align 4 _vpaes_key_preheat: - adr x10, .Lk_inv + adr PTR(10), .Lk_inv movi v16.16b, #0x5b // .Lk_s63 - adr x11, .Lk_sb1 + adr PTR(11), .Lk_sb1 movi v17.16b, #0x0f // .Lk_s0F - ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt - adr x10, .Lk_dksd - ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 - adr x11, .Lk_mc_forward - ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb - ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 - ld1 {v8.2d}, [x10] // .Lk_rcon - ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [PTR(10)] // .Lk_inv, .Lk_ipt + adr PTR(10), .Lk_dksd + ld1 {v22.2d,v23.2d}, [PTR(11)] // .Lk_sb1 + adr PTR(11), .Lk_mc_forward + ld1 
{v24.2d,v25.2d,v26.2d,v27.2d}, [PTR(10)],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [PTR(10)],#64 // .Lk_dkse, .Lk_dks9 + ld1 {v8.2d}, [PTR(10)] // .Lk_rcon + ld1 {v9.2d}, [PTR(11)] // .Lk_mc_forward[0] ret .size _vpaes_key_preheat,.-_vpaes_key_preheat @@ -606,31 +614,31 @@ _vpaes_key_preheat: .align 4 _vpaes_schedule_core: AARCH64_SIGN_LINK_REGISTER - stp x29, x30, [sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 bl _vpaes_key_preheat // load the tables - ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + ld1 {v0.16b}, [PTR(0)],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) // input transform mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 bl _vpaes_schedule_transform mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 - adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10 - add x8, x8, x10 + adr PTR(10), .Lk_sr // lea .Lk_sr(%rip),%r10 + add PTR(8), PTR(10), x8 cbnz w3, .Lschedule_am_decrypting // encrypting, output zeroth round key after transform - st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + st1 {v0.2d}, [PTR(2)] // vmovdqu %xmm0, (%rdx) b .Lschedule_go .Lschedule_am_decrypting: // decrypting, output zeroth round key after shiftrows - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + ld1 {v1.2d}, [PTR(8)] // vmovdqa (%r8,%r10), %xmm1 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + st1 {v3.2d}, [PTR(2)] // vmovdqu %xmm3, (%rdx) eor x8, x8, #0x30 // xor $0x30, %r8 .Lschedule_go: @@ -651,7 +659,7 @@ _vpaes_schedule_core: mov x0, #10 // mov $10, %esi .Loop_schedule_128: - sub x0, x0, #1 // dec %esi + sub x0, x0, #1 // dec %esi bl _vpaes_schedule_round cbz x0, .Lschedule_mangle_last bl _vpaes_schedule_mangle // write output @@ -674,8 +682,8 @@ _vpaes_schedule_core: // .align 4 .Lschedule_192: - sub x0, x0, #8 - ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + sub PTR(0), PTR(0), #8 + ld1 {v0.16b}, [PTR(0)] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) bl _vpaes_schedule_transform // input transform mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 @@ -683,7 +691,7 @@ _vpaes_schedule_core: mov x0, #4 // mov $4, %esi .Loop_schedule_192: - sub x0, x0, #1 // dec %esi + sub x0, x0, #1 // dec %esi bl _vpaes_schedule_round ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 bl _vpaes_schedule_mangle // save key n @@ -707,12 +715,12 @@ _vpaes_schedule_core: // .align 4 .Lschedule_256: - ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + ld1 {v0.16b}, [PTR(0)] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) bl _vpaes_schedule_transform // input transform mov x0, #7 // mov $7, %esi .Loop_schedule_256: - sub x0, x0, #1 // dec %esi + sub x0, x0, #1 // dec %esi bl _vpaes_schedule_mangle // output low result mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 @@ -744,21 +752,21 @@ _vpaes_schedule_core: .align 4 .Lschedule_mangle_last: // schedule last round key from xmm0 - adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + adr PTR(11), .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew cbnz w3, .Lschedule_mangle_last_dec // encrypting - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 - adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform - add x2, x2, #32 // add $32, %rdx + ld1 {v1.2d}, [PTR(8)] // vmovdqa (%r8,%r10),%xmm1 + adr PTR(11), .Lk_opt 
// lea .Lk_opt(%rip), %r11 # prepare to output transform + add PTR(2), PTR(2), #32 // add $32, %rdx tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute .Lschedule_mangle_last_dec: - ld1 {v20.2d,v21.2d}, [x11] // reload constants - sub x2, x2, #16 // add $-16, %rdx + ld1 {v20.2d,v21.2d}, [PTR(11)] // reload constants + sub PTR(2), PTR(2), #16 // add $-16, %rdx eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 bl _vpaes_schedule_transform // output transform - st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + st1 {v0.2d}, [PTR(2)] // vmovdqu %xmm0, (%rdx) # save last key // cleanup eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 @@ -769,7 +777,7 @@ _vpaes_schedule_core: eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 - ldp x29, x30, [sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size _vpaes_schedule_core,.-_vpaes_schedule_core @@ -922,12 +930,12 @@ _vpaes_schedule_mangle: // encrypting eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 - add x2, x2, #16 // add $16, %rdx + add PTR(2), PTR(2), #16 // add $16, %rdx tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + ld1 {v1.2d}, [PTR(8)] // vmovdqa (%r8,%r10), %xmm1 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 b .Lschedule_mangle_both @@ -965,17 +973,21 @@ _vpaes_schedule_mangle: tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 // vmovdqa 0x70(%r11), %xmm4 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + ld1 {v1.2d}, [PTR(8)] // vmovdqa (%r8,%r10), %xmm1 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 - sub x2, x2, #16 // add $-16, %rdx + sub PTR(2), PTR(2), #16 // add $-16, %rdx .Lschedule_mangle_both: tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - add x8, x8, #64-16 // add $-16, %r8 + add PTR(8), PTR(8), #64-16 // add $-16, %r8 +#ifdef __CHERI_PURE_CAPABILITY__ + alignd c8, c8, #6 // and $0x30, %r8 +#else and x8, x8, #~(1<<6) // and $0x30, %r8 - st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) +#endif + st1 {v3.2d}, [PTR(2)] // vmovdqu %xmm3, (%rdx) ret .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle @@ -984,21 +996,21 @@ _vpaes_schedule_mangle: .align 4 vpaes_set_encrypt_key: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! 
// ABI spec says so lsr w9, w1, #5 // shr $5,%eax add w9, w9, #5 // $5,%eax - str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + str w9, [PTR(2),#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; mov w3, #0 // mov $0,%ecx mov x8, #0x30 // mov $0x30,%r8d bl _vpaes_schedule_core eor x0, x0, x0 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key @@ -1008,16 +1020,16 @@ vpaes_set_encrypt_key: .align 4 vpaes_set_decrypt_key: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! // ABI spec says so lsr w9, w1, #5 // shr $5,%eax add w9, w9, #5 // $5,%eax - str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + str w9, [PTR(2),#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; lsl w9, w9, #4 // shl $4,%eax - add x2, x2, #16 // lea 16(%rdx,%rax),%rdx - add x2, x2, x9 + add PTR(2), PTR(2), #16 // lea 16(%rdx,%rax),%rdx + add PTR(2), PTR(2), x9 mov w3, #1 // mov $1,%ecx lsr w8, w1, #1 // shr $1,%r8d @@ -1025,8 +1037,8 @@ vpaes_set_decrypt_key: eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 bl _vpaes_schedule_core - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key @@ -1039,28 +1051,28 @@ vpaes_cbc_encrypt: cmp w5, #0 // check direction b.eq vpaes_cbc_decrypt - stp x29,x30,[sp,#-16]! - add x29,sp,#0 + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 mov x17, x2 // reassign - mov x2, x3 // reassign + mov PTR(2), PTR(3) // reassign - ld1 {v0.16b}, [x4] // load ivec + ld1 {v0.16b}, [PTR(4)] // load ivec bl _vpaes_encrypt_preheat b .Lcbc_enc_loop .align 4 .Lcbc_enc_loop: - ld1 {v7.16b}, [x0],#16 // load input + ld1 {v7.16b}, [PTR(0)],#16 // load input eor v7.16b, v7.16b, v0.16b // xor with ivec bl _vpaes_encrypt_core - st1 {v0.16b}, [x1],#16 // save output + st1 {v0.16b}, [PTR(1)],#16 // save output subs x17, x17, #16 b.hi .Lcbc_enc_loop - st1 {v0.16b}, [x4] // write ivec + st1 {v0.16b}, [PTR(4)] // write ivec - ldp x29,x30,[sp],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) .Lcbc_abort: AARCH64_VALIDATE_LINK_REGISTER ret @@ -1071,47 +1083,47 @@ vpaes_cbc_encrypt: vpaes_cbc_decrypt: // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to // only from vpaes_cbc_encrypt which has already signed the return address. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! // ABI spec says so + stp d10,d11,[PTRN(sp),#-16]! + stp d12,d13,[PTRN(sp),#-16]! + stp d14,d15,[PTRN(sp),#-16]! 
mov x17, x2 // reassign - mov x2, x3 // reassign - ld1 {v6.16b}, [x4] // load ivec + mov PTR(2), PTR(3) // reassign + ld1 {v6.16b}, [PTR(4)] // load ivec bl _vpaes_decrypt_preheat tst x17, #16 b.eq .Lcbc_dec_loop2x - ld1 {v7.16b}, [x0], #16 // load input + ld1 {v7.16b}, [PTR(0)], #16 // load input bl _vpaes_decrypt_core eor v0.16b, v0.16b, v6.16b // xor with ivec orr v6.16b, v7.16b, v7.16b // next ivec value - st1 {v0.16b}, [x1], #16 + st1 {v0.16b}, [PTR(1)], #16 subs x17, x17, #16 b.ls .Lcbc_dec_done .align 4 .Lcbc_dec_loop2x: - ld1 {v14.16b,v15.16b}, [x0], #32 + ld1 {v14.16b,v15.16b}, [PTR(0)], #32 bl _vpaes_decrypt_2x eor v0.16b, v0.16b, v6.16b // xor with ivec eor v1.16b, v1.16b, v14.16b orr v6.16b, v15.16b, v15.16b - st1 {v0.16b,v1.16b}, [x1], #32 + st1 {v0.16b,v1.16b}, [PTR(1)], #32 subs x17, x17, #32 b.hi .Lcbc_dec_loop2x .Lcbc_dec_done: - st1 {v6.16b}, [x4] + st1 {v6.16b}, [PTR(4)] - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d14,d15,[PTRN(sp)],#16 + ldp d12,d13,[PTRN(sp)],#16 + ldp d10,d11,[PTRN(sp)],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt @@ -1120,39 +1132,39 @@ vpaes_cbc_decrypt: .align 4 vpaes_ecb_encrypt: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! // ABI spec says so + stp d10,d11,[PTRN(sp),#-16]! + stp d12,d13,[PTRN(sp),#-16]! + stp d14,d15,[PTRN(sp),#-16]! mov x17, x2 - mov x2, x3 + mov PTR(2), PTR(3) bl _vpaes_encrypt_preheat tst x17, #16 b.eq .Lecb_enc_loop - ld1 {v7.16b}, [x0],#16 + ld1 {v7.16b}, [PTR(0)],#16 bl _vpaes_encrypt_core - st1 {v0.16b}, [x1],#16 + st1 {v0.16b}, [PTR(1)],#16 subs x17, x17, #16 b.ls .Lecb_enc_done .align 4 .Lecb_enc_loop: - ld1 {v14.16b,v15.16b}, [x0], #32 + ld1 {v14.16b,v15.16b}, [PTR(0)], #32 bl _vpaes_encrypt_2x - st1 {v0.16b,v1.16b}, [x1], #32 + st1 {v0.16b,v1.16b}, [PTR(1)], #32 subs x17, x17, #32 b.hi .Lecb_enc_loop .Lecb_enc_done: - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d14,d15,[PTRN(sp)],#16 + ldp d12,d13,[PTRN(sp)],#16 + ldp d10,d11,[PTRN(sp)],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt @@ -1162,39 +1174,39 @@ vpaes_ecb_encrypt: .align 4 vpaes_ecb_decrypt: AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! + stp PTR(29),PTR(30),[PTRN(sp),#-(2*PTR_WIDTH)]! + add PTR(29),PTRN(sp),#0 + stp d8,d9,[PTRN(sp),#-16]! // ABI spec says so + stp d10,d11,[PTRN(sp),#-16]! + stp d12,d13,[PTRN(sp),#-16]! + stp d14,d15,[PTRN(sp),#-16]! 
mov x17, x2 - mov x2, x3 + mov PTR(2), PTR(3) bl _vpaes_decrypt_preheat tst x17, #16 b.eq .Lecb_dec_loop - ld1 {v7.16b}, [x0],#16 + ld1 {v7.16b}, [PTR(0)],#16 bl _vpaes_encrypt_core - st1 {v0.16b}, [x1],#16 + st1 {v0.16b}, [PTR(1)],#16 subs x17, x17, #16 b.ls .Lecb_dec_done .align 4 .Lecb_dec_loop: - ld1 {v14.16b,v15.16b}, [x0], #32 + ld1 {v14.16b,v15.16b}, [PTR(0)], #32 bl _vpaes_decrypt_2x - st1 {v0.16b,v1.16b}, [x1], #32 + st1 {v0.16b,v1.16b}, [PTR(1)], #32 subs x17, x17, #32 b.hi .Lecb_dec_loop .Lecb_dec_done: - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 + ldp d14,d15,[PTRN(sp)],#16 + ldp d12,d13,[PTRN(sp)],#16 + ldp d10,d11,[PTRN(sp)],#16 + ldp d8,d9,[PTRN(sp)],#16 + ldp PTR(29),PTR(30),[PTRN(sp)],#(2*PTR_WIDTH) AARCH64_VALIDATE_LINK_REGISTER ret .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt diff --git a/sys/crypto/openssl/arm_arch.h b/sys/crypto/openssl/arm_arch.h index 7bedb385d971..d8c9ae6d7796 100644 --- a/sys/crypto/openssl/arm_arch.h +++ b/sys/crypto/openssl/arm_arch.h @@ -177,6 +177,22 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; .popsection; # endif + /* + * Support macros for Morello + */ + +# if __ARM_ARCH__>=8 +# ifdef __CHERI_PURE_CAPABILITY__ +# define PTR_WIDTH 16 +# define PTR(n) c ## n +# define PTRN(n) c ## n +# else +# define PTR_WIDTH 8 +# define PTR(n) x ## n +# define PTRN(n) n +# endif +# endif + # endif /* defined __ASSEMBLER__ */ #endif From ea9aa43ce31fd099cc94f55b39908eba90f4f704 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Wed, 1 May 2024 10:16:56 -0700 Subject: [PATCH 16/19] libcrypto: Enable build of assembly routines for Morello purecap --- secure/lib/libcrypto/Makefile | 7 ++++--- secure/lib/libcrypto/Makefile.common | 3 +-- secure/lib/libcrypto/modules/fips/Makefile | 7 ++++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/secure/lib/libcrypto/Makefile b/secure/lib/libcrypto/Makefile index dc701d90451e..d79320761b82 100644 --- a/secure/lib/libcrypto/Makefile +++ b/secure/lib/libcrypto/Makefile @@ -25,7 +25,7 @@ SRCS+= param_build.c param_build_set.c params.c params_dup.c params_from_text.c SRCS+= passphrase.c provider.c provider_child.c provider_conf.c provider_core.c provider_predefined.c punycode.c self_test_core.c sparse_array.c threads_lib.c threads_none.c threads_pthread.c trace.c uid.c .if defined(ASM_aarch64) SRCS+= arm64cpuid.S armcap.c -ACFLAGS.arm64cpuid.S= -march=armv8-a+crypto +ACFLAGS.arm64cpuid.S= ${CFLAGS_CRYPTO} .elif defined(ASM_amd64) SRCS+= x86_64cpuid.S .elif defined(ASM_arm) @@ -46,7 +46,7 @@ SRCS+= mem_clr.c SRCS+= aes_cfb.c aes_ecb.c aes_ige.c aes_misc.c aes_ofb.c aes_wrap.c .if defined(ASM_aarch64) SRCS+= aes_cbc.c aes_core.c aesv8-armx.S vpaes-armv8.S -ACFLAGS.aesv8-armx.S= -march=armv8-a+crypto +ACFLAGS.aesv8-armx.S= ${CFLAGS_CRYPTO} .elif defined(ASM_amd64) SRCS+= aes-x86_64.S aesni-mb-x86_64.S aesni-sha1-x86_64.S SRCS+= aesni-sha256-x86_64.S aesni-x86_64.S bsaes-x86_64.S vpaes-x86_64.S @@ -295,7 +295,8 @@ SRCS+= cbc128.c ccm128.c cfb128.c ctr128.c cts128.c gcm128.c ocb128.c SRCS+= ofb128.c siv128.c wrap128.c xts128.c .if defined(ASM_aarch64) SRCS+= ghashv8-armx.S aes-gcm-armv8_64.S -ACFLAGS.ghashv8-armx.S= -march=armv8-a+crypto +ACFLAGS.ghashv8-armx.S= ${CFLAGS_CRYPTO} +ACFLAGS.aes-gcm-armv8_64.S= ${CFLAGS_CRYPTO} .elif defined(ASM_amd64) SRCS+= aesni-gcm-x86_64.S ghash-x86_64.S .elif defined(ASM_arm) diff --git a/secure/lib/libcrypto/Makefile.common b/secure/lib/libcrypto/Makefile.common index 8c8fcdb8a54d..0a4b7207ff13 100644 --- 
a/secure/lib/libcrypto/Makefile.common +++ b/secure/lib/libcrypto/Makefile.common @@ -10,9 +10,7 @@ CFLAGS+= -DB_ENDIAN .ifndef WITHOUT_AUTO_ASM .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "arm" || ${MACHINE_CPUARCH} == "i386" -.if !${MACHINE_ARCH:Maarch64*c*} ASM_${MACHINE_CPUARCH}= -.endif .elif ${MACHINE_ARCH} == "powerpc" || ${MACHINE_ARCH} == "powerpc64" || \ ${MACHINE_ARCH} == "powerpc64le" ASM_${MACHINE_ARCH}= @@ -22,6 +20,7 @@ ASM_${MACHINE_ARCH}= .if defined(ASM_${MACHINE_CPUARCH}) || defined(ASM_${MACHINE_ARCH}) CFLAGS+= -DOPENSSL_CPUID_OBJ .if defined(ASM_aarch64) +CFLAGS_CRYPTO= ${CFLAGS:M-march=*:S/^$/-march=armv8-a/W:[-1]}+crypto CFLAGS+= -DOPENSSL_BN_ASM_MONT CFLAGS+= -DSHA1_ASM -DSHA256_ASM -DSHA512_ASM CFLAGS+= -DKECCAK1600_ASM diff --git a/secure/lib/libcrypto/modules/fips/Makefile b/secure/lib/libcrypto/modules/fips/Makefile index 8843cb9717c9..daa65b5c556a 100644 --- a/secure/lib/libcrypto/modules/fips/Makefile +++ b/secure/lib/libcrypto/modules/fips/Makefile @@ -14,7 +14,7 @@ SRCS+= provider_core.c provider_predefined.c \ SRCS+= cpuid.c ctype.c .if defined(ASM_aarch64) SRCS+= arm64cpuid.S armcap.c -ACFLAGS.arm64cpuid.S= -march=armv8-a+crypto +ACFLAGS.arm64cpuid.S= ${CFLAGS_CRYPTO} .elif defined(ASM_amd64) SRCS+= x86_64cpuid.S .elif defined(ASM_arm) @@ -35,7 +35,7 @@ SRCS+= mem_clr.c SRCS+= aes_cfb.c aes_ecb.c aes_ige.c aes_misc.c aes_ofb.c aes_wrap.c .if defined(ASM_aarch64) SRCS+= aes_cbc.c aes_core.c aesv8-armx.S vpaes-armv8.S -ACFLAGS.aesv8-armx.S= -march=armv8-a+crypto +ACFLAGS.aesv8-armx.S= ${CFLAGS_CRYPTO} .elif defined(ASM_amd64) SRCS+= aes-x86_64.S aesni-mb-x86_64.S aesni-sha1-x86_64.S SRCS+= aesni-sha256-x86_64.S aesni-x86_64.S bsaes-x86_64.S vpaes-x86_64.S @@ -155,7 +155,8 @@ SRCS+= cbc128.c ctr128.c cfb128.c ofb128.c gcm128.c ccm128.c xts128.c SRCS+= wrap128.c .if defined(ASM_aarch64) SRCS+= ghashv8-armx.S aes-gcm-armv8_64.S -ACFLAGS.ghashv8-armx.S= -march=armv8-a+crypto +ACFLAGS.ghashv8-armx.S= ${CFLAGS_CRYPTO} +ACFLAGS.aes-gcm-armv8_64.S= ${CFLAGS_CRYPTO} .elif defined(ASM_amd64) SRCS+= aesni-gcm-x86_64.S ghash-x86_64.S .elif defined(ASM_arm) From 6b62ff904c2c8b28c3f538873030f31259d8e030 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 17 May 2024 16:46:34 -0700 Subject: [PATCH 17/19] armv8crypto,ossl: Use correct -march for Morello --- sys/modules/armv8crypto/Makefile | 8 +++++--- sys/modules/ossl/Makefile | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sys/modules/armv8crypto/Makefile b/sys/modules/armv8crypto/Makefile index 74ea77fbb761..6fd1fa079989 100644 --- a/sys/modules/armv8crypto/Makefile +++ b/sys/modules/armv8crypto/Makefile @@ -9,26 +9,28 @@ OBJS+= armv8_crypto_wrap.o aesv8-armx.o ghashv8-armx.o CFLAGS+=-I${SRCTOP}/sys/crypto/openssl +CFLAGS_CRYPTO= ${CFLAGS:M-march=*:S/^$/-march=armv8-a/W:[-1]}+crypto + # Remove -nostdinc so we can get the intrinsics. 
armv8_crypto_wrap.o: armv8_crypto_wrap.c ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc:N-mgeneral-regs-only} \ -I${SRCTOP}/sys/crypto/armv8 \ ${WERROR} ${PROF} \ - -march=armv8-a+crypto ${.IMPSRC} + ${CFLAGS_CRYPTO} ${.IMPSRC} ${CTFCONVERT_CMD} aesv8-armx.o: aesv8-armx.S ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc:N-mgeneral-regs-only} \ -I${SRCTOP}/sys/crypto/armv8 \ ${WERROR} ${PROF} \ - -march=armv8-a+crypto ${.IMPSRC} + ${CFLAGS_CRYPTO} ${.IMPSRC} ${CTFCONVERT_CMD} ghashv8-armx.o: ghashv8-armx.S ${CC} -c ${CFLAGS:C/^-O2$/-O3/:N-nostdinc:N-mgeneral-regs-only} \ -I${SRCTOP}/sys/crypto/armv8 \ ${WERROR} ${PROF} \ - -march=armv8-a+crypto ${.IMPSRC} + ${CFLAGS_CRYPTO} ${.IMPSRC} ${CTFCONVERT_CMD} armv8_crypto_wrap.o: armv8_crypto.h diff --git a/sys/modules/ossl/Makefile b/sys/modules/ossl/Makefile index 9777e0bcfacc..e42c33a1753a 100644 --- a/sys/modules/ossl/Makefile +++ b/sys/modules/ossl/Makefile @@ -73,7 +73,7 @@ ${SRCS.aarch64:M*.S:S/S/o/}: ${.TARGET:R}.S # Clang doesn't recognize "aes*" instructions without -march set. aesv8-armx.o: aesv8-armx.S ${CC} -c ${CFLAGS:N-mgeneral-regs-only} ${WERROR} ${PROF} \ - -march=armv8-a+crypto ${.IMPSRC} + ${CFLAGS:M-march=*:S/^$/-march=armv8-a/W:[-1]}+crypto ${.IMPSRC} ${CTFCONVERT_CMD} OBJS.aarch64= aesv8-armx.o From 40aa224ff710440bc22c0491ec727412f27bfb41 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 17 May 2024 16:49:45 -0700 Subject: [PATCH 18/19] armv8crypto,ossl: Enable build for Morello purecap --- sys/arm64/conf/GENERIC-MORELLO-PURECAP | 2 -- sys/modules/Makefile | 3 --- 2 files changed, 5 deletions(-) diff --git a/sys/arm64/conf/GENERIC-MORELLO-PURECAP b/sys/arm64/conf/GENERIC-MORELLO-PURECAP index 887e26b64eb9..b53efaef7185 100644 --- a/sys/arm64/conf/GENERIC-MORELLO-PURECAP +++ b/sys/arm64/conf/GENERIC-MORELLO-PURECAP @@ -30,6 +30,4 @@ options CHERI_PURECAP_KERNEL nooptions PERTHREAD_SSP # Not relevant in purecap -nodevice armv8crypto # No purecap assembly - nodevice dpaa2 # Stores pointers in rman_res_t diff --git a/sys/modules/Makefile b/sys/modules/Makefile index aef5bb84613b..31a562efc4b9 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -574,10 +574,8 @@ _vmware= vmware .if ${MACHINE_CPUARCH} == "aarch64" || ${MACHINE_CPUARCH} == "amd64" || \ ${MACHINE_CPUARCH} == "i386" || ${MACHINE_ARCH} == "armv7" -.if !${MACHINE_ABI:Mpurecap} _ossl= ossl .endif -.endif # MAC framework .if ${KERN_OPTS:MMAC} || defined(ALL_MODULES) @@ -917,7 +915,6 @@ _malo= malo # (e.g. we don't want purecap kernels penalised by not having accelerated # crypto just because we haven't ported the assembly). 
.if ${MACHINE_CPU:Mcheri} -BROKEN_MODULES+=armv8crypto BROKEN_MODULES+=dpaa2 BROKEN_MODULES+=ena BROKEN_MODULES+=linprocfs linux64 linux_common From 6af15334331203b8bd476c6b67a589318d112349 Mon Sep 17 00:00:00 2001 From: John Baldwin Date: Fri, 5 Jul 2024 09:49:22 -0700 Subject: [PATCH 19/19] HACK: Workaround Morello LLVM bug by moving ecp_nistz256_precomputed to rodata --- crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl | 2 ++ sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S | 2 ++ 2 files changed, 4 insertions(+) diff --git a/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl index 9a9114e37bbb..3a42a87c8fe3 100755 --- a/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl +++ b/crypto/openssl/crypto/ec/asm/ecp_nistz256-armv8.pl @@ -79,6 +79,7 @@ die "insane number of elements" if ($#arr != 64*16*37-1); $code.=<<___; +.rodata .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,%object .align 12 @@ -103,6 +104,7 @@ } $code.=<<___; .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.text .align 5 .Lpoly: .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 diff --git a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S index 48336c0c9891..0ea242b5265e 100644 --- a/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S +++ b/sys/crypto/openssl/aarch64/ecp_nistz256-armv8.S @@ -2,6 +2,7 @@ #include "arm_arch.h" .text +.section .rodata .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,%object .align 12 @@ -2375,6 +2376,7 @@ ecp_nistz256_precomputed: .byte 0xec,0xf0,0x42,0x88,0xd0,0x81,0x51,0xf9,0x1b,0xbc,0x43,0xa4,0x37,0xf1,0xd7,0x90,0x21,0x7e,0xa0,0x3e,0x63,0xfb,0x21,0xfa,0x12,0xfb,0xde,0xc7,0xbf,0xb3,0x58,0xe7,0x76,0x42,0x20,0x01,0x3d,0x66,0x80,0xf1,0xb8,0xaf,0xfa,0x7d,0x96,0x89,0x36,0x48,0x95,0xd9,0x6e,0x6d,0xe6,0x4f,0xff,0x2a,0x47,0x61,0xf2,0x04,0xb7,0x83,0x14,0xce .byte 0x0a,0x3c,0x73,0x17,0x50,0x88,0x03,0x25,0x4a,0xe3,0x13,0x55,0x8b,0x7e,0x50,0x38,0xfc,0x14,0x0b,0x04,0x8e,0xa8,0x5b,0xd6,0x72,0x20,0x60,0xe9,0xaa,0x22,0x82,0x11,0xc6,0xc4,0xd7,0xb9,0xc8,0x0c,0x7e,0x05,0xfb,0x90,0xe4,0x9c,0x28,0x89,0x29,0x99,0x63,0x4d,0xec,0x7b,0x50,0xbd,0xd8,0xa3,0x5b,0x50,0x77,0x19,0x81,0x92,0xce,0x82 .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed +.text .align 5 .Lpoly: .quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001
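
A note on the -march handling shared by patches 16 and 17: the make expression

	CFLAGS_CRYPTO=	${CFLAGS:M-march=*:S/^$/-march=armv8-a/W:[-1]}+crypto

keeps only the -march=* words of CFLAGS, substitutes -march=armv8-a when no such word is present (the W flag of :S treats the filtered, possibly empty, list as a single value), selects the last surviving word with :[-1], and appends +crypto. Below is a minimal bmake sketch of the intended behaviour; the sample CFLAGS value and -march=morello are assumptions for illustration only, not taken from the patches.

	# Hypothetical flags for a Morello purecap build; -march=morello is assumed.
	CFLAGS=		-O2 -pipe -march=morello
	# Same expression as in Makefile.common and armv8crypto/Makefile.
	CFLAGS_CRYPTO=	${CFLAGS:M-march=*:S/^$/-march=armv8-a/W:[-1]}+crypto

	all:
		@echo ${CFLAGS_CRYPTO}	# prints -march=morello+crypto

With no -march in CFLAGS the same expression falls back to -march=armv8-a+crypto, the value these Makefiles hard-coded before this series, so non-Morello builds keep the flags they had.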