From 7f140211801906630d411eb3b494135a88575123 Mon Sep 17 00:00:00 2001
From: Robert Clausecker
Date: Thu, 19 Sep 2024 17:16:36 +0200
Subject: [PATCH 1/3] libexec/rtld-elf/aarch64: pass hwcap, hwcap2 to ifunc
 resolvers

This allows ifunc resolvers to quickly check for the availability
of common AArch64 extensions.

Event: EuroBSDcon 2024
Approved by: kib
---
 libexec/rtld-elf/aarch64/reloc.c        | 14 +++++++++++++-
 libexec/rtld-elf/aarch64/rtld_machdep.h |  4 +++-
 sys/arm64/include/ifunc.h               |  4 ++--
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/libexec/rtld-elf/aarch64/reloc.c b/libexec/rtld-elf/aarch64/reloc.c
index 3687c9385326..444706b35528 100644
--- a/libexec/rtld-elf/aarch64/reloc.c
+++ b/libexec/rtld-elf/aarch64/reloc.c
@@ -451,10 +451,22 @@ reloc_jmpslot(Elf_Addr *where, Elf_Addr target,
 	return (target);
 }
 
+/*
+ * AT_HWCAP and AT_HWCAP2 from the auxiliary vector; each stays zero if the
+ * vector has no such entry.  call_ifunc_resolver() passes them to ifunc
+ * resolvers as the first two arguments.
+ */
+unsigned long elf_hwcap, elf_hwcap2;
+
 void
-ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)] __unused)
+ifunc_init(Elf_Auxinfo *aux_info[__min_size(AT_COUNT)])
 {
+	if (aux_info[AT_HWCAP] != NULL)
+		elf_hwcap = aux_info[AT_HWCAP]->a_un.a_val;
+
+	if (aux_info[AT_HWCAP2] != NULL)
+		elf_hwcap2 = aux_info[AT_HWCAP2]->a_un.a_val;
 
 }
 
 /*
diff --git a/libexec/rtld-elf/aarch64/rtld_machdep.h b/libexec/rtld-elf/aarch64/rtld_machdep.h
index 3cc1339fcad4..071c962947a6 100644
--- a/libexec/rtld-elf/aarch64/rtld_machdep.h
+++ b/libexec/rtld-elf/aarch64/rtld_machdep.h
@@ -72,9 +72,11 @@ Elf_Addr reloc_jmpslot(Elf_Addr *where, Elf_Addr target,
  * no arguments are passed in, and if this changes later will be able to
  * compare the argument with 0 to see if it is set.
  */
+extern unsigned long elf_hwcap, elf_hwcap2;
 #define	call_ifunc_resolver(ptr) \
 	(((Elf_Addr (*)(uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, \
-	    uint64_t, uint64_t, uint64_t))ptr)(0, 0, 0, 0, 0, 0, 0, 0))
+	    uint64_t, uint64_t, uint64_t))ptr)(elf_hwcap, elf_hwcap2, \
+	    0, 0, 0, 0, 0, 0))
 
 #define	round(size, align) \
 	(((size) + (align) - 1) & ~((align) - 1))
diff --git a/sys/arm64/include/ifunc.h b/sys/arm64/include/ifunc.h
index de452ad34c8f..48a4f45afcc7 100644
--- a/sys/arm64/include/ifunc.h
+++ b/sys/arm64/include/ifunc.h
@@ -39,10 +39,10 @@
 	uint64_t, uint64_t, uint64_t, uint64_t, uint64_t,		\
 	uint64_t))args __used;						\
     qual ret_type name args __attribute__((ifunc(#name "_resolver")));	\
-    static ret_type (*name##_resolver(uint64_t _arg1 __unused,		\
-	uint64_t _arg2 __unused, uint64_t _arg3 __unused,		\
+    static ret_type (*name##_resolver(uint64_t elf_hwcap __unused,	\
+	uint64_t elf_hwcap2 __unused, uint64_t _arg3 __unused,		\
 	uint64_t _arg4 __unused, uint64_t _arg5 __unused,		\
 	uint64_t _arg6 __unused, uint64_t _arg7 __unused,		\
 	uint64_t _arg8 __unused))args
 
 #endif
-- 
2.46.0


From 83c86e6354264327f7d319ab56fa9b62c08d028c Mon Sep 17 00:00:00 2001
From: Robert Clausecker
Date: Thu, 19 Sep 2024 16:40:37 +0200
Subject: [PATCH 2/3] lib/libc/aarch64/string: add ASIMD-enhanced
 timingsafe_bcmp implementation

A straightforward port of the amd64 implementation.

Event: EuroBSDcon 2024
---
 lib/libc/aarch64/string/Makefile.inc      |   1 +
 lib/libc/aarch64/string/timingsafe_bcmp.S | 123 ++++++++++++++++++++++
 2 files changed, 124 insertions(+)
 create mode 100644 lib/libc/aarch64/string/timingsafe_bcmp.S

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index 767a29805c63..70436dc8480d 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -32,6 +32,7 @@ MDSRCS+= \
 	strncat.c \
 	strlcat.c \
 	strlen.S \
+	timingsafe_bcmp.S \
 	bcopy.c \
 	bzero.c
 
diff --git a/lib/libc/aarch64/string/timingsafe_bcmp.S b/lib/libc/aarch64/string/timingsafe_bcmp.S
new file mode 100644
index 000000000000..baa5c6f0940c
--- /dev/null
+++ b/lib/libc/aarch64/string/timingsafe_bcmp.S
@@ -0,0 +1,123 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Robert Clausecker
+ */
+
+#include <machine/asm.h>
+
+/*
+ * int timingsafe_bcmp(const void *b1, const void *b2, size_t len)
+ *
+ * In: x0 = b1, x1 = b2, x2 = len.  Out: w0 is zero iff the buffers match.
+ *
+ * Every branch depends on len only, never on the buffer contents.  Up to
+ * 32 bytes are covered by two possibly overlapping loads per buffer, one at
+ * the start and one at the end; longer buffers take a 32 byte loop plus one
+ * final load of the last 32 bytes.
+ */
+ENTRY(timingsafe_bcmp)
+	cmp	x2, #32			// at least 33 bytes to process?
+	bhi	.Lgt32
+
+	cmp	x2, #16			// at least 17 bytes to process?
+	bhi	.L1732
+
+	cmp	x2, #8			// at least 9 bytes to process?
+	bhi	.L0916
+
+	cmp	x2, #4			// at least 5 bytes to process?
+	bhi	.L0508
+
+	cmp	x2, #2			// at least 3 bytes to process?
+	bhi	.L0304
+
+	cbnz	x2, .L0102		// buffer empty?
+
+	mov	w0, #0			// empty buffer always matches
+	ret
+
+.L0102:	ldrb	w3, [x0]		// load first bytes
+	ldrb	w4, [x1]
+	sub	x2, x2, #1
+	ldrb	w5, [x0, x2]		// load last bytes
+	ldrb	w6, [x1, x2]
+	eor	w3, w3, w4
+	eor	w5, w5, w6
+	orr	w0, w3, w5
+	ret
+
+.L0304:	ldrh	w3, [x0]		// load first halfwords
+	ldrh	w4, [x1]
+	sub	x2, x2, #2
+	ldrh	w5, [x0, x2]		// load last halfwords
+	ldrh	w6, [x1, x2]
+	eor	w3, w3, w4
+	eor	w5, w5, w6
+	orr	w0, w3, w5
+	ret
+
+.L0508:	ldr	w3, [x0]		// load first words
+	ldr	w4, [x1]
+	sub	x2, x2, #4
+	ldr	w5, [x0, x2]		// load last words
+	ldr	w6, [x1, x2]
+	eor	w3, w3, w4
+	eor	w5, w5, w6
+	orr	w0, w3, w5
+	ret
+
+.L0916:	ldr	x3, [x0]		// load first doublewords
+	ldr	x4, [x1]
+	sub	x2, x2, #8
+	ldr	x5, [x0, x2]		// load last doublewords
+	ldr	x6, [x1, x2]
+	eor	x3, x3, x4
+	eor	x5, x5, x6
+	orr	x0, x3, x5
+	orr	x0, x0, x0, lsr #32	// ensure low 32 bits are nonzero iff mismatch
+	ret
+
+.L1732:	ldr	q0, [x0]		// load first 16 bytes
+	ldr	q1, [x1]
+	sub	x2, x2, #16
+	ldr	q2, [x0, x2]		// load last 16 bytes
+	ldr	q3, [x1, x2]
+	eor	v0.16b, v0.16b, v1.16b
+	eor	v2.16b, v2.16b, v3.16b
+	orr	v0.16b, v0.16b, v2.16b
+	umaxv	s0, v0.4s		// get a nonzero word if any
+	mov	w0, v0.s[0]
+	ret
+
+	/* more than 32 bytes: process buffer in a loop */
+.Lgt32:	ldp	q0, q1, [x0], #32
+	ldp	q2, q3, [x1], #32
+	eor	v0.16b, v0.16b, v2.16b
+	eor	v1.16b, v1.16b, v3.16b
+	orr	v4.16b, v0.16b, v1.16b	// v4 accumulates all differences
+	subs	x2, x2, #64		// enough left for another iteration?
+	bls	.Ltail
+
+0:	ldp	q0, q1, [x0], #32
+	ldp	q2, q3, [x1], #32
+	eor	v0.16b, v0.16b, v2.16b
+	eor	v1.16b, v1.16b, v3.16b
+	orr	v0.16b, v0.16b, v1.16b
+	orr	v4.16b, v4.16b, v0.16b
+	subs	x2, x2, #32
+	bhi	0b
+
+	/* process last 32 bytes */
+.Ltail:	add	x0, x0, x2		// point to the last 32 bytes in the buffer
+	add	x1, x1, x2
+	ldp	q0, q1, [x0]
+	ldp	q2, q3, [x1]
+	eor	v0.16b, v0.16b, v2.16b
+	eor	v1.16b, v1.16b, v3.16b
+	orr	v0.16b, v0.16b, v1.16b
+	orr	v4.16b, v4.16b, v0.16b
+	umaxv	s0, v4.4s		// get a nonzero word if any
+	mov	w0, v0.s[0]
+	ret
+END(timingsafe_bcmp)
-- 
2.46.0


From 924746cb8d5e6485536f9f1033c393c1b30cee29 Mon Sep 17 00:00:00 2001
From: Robert Clausecker
Date: Fri, 20 Sep 2024 11:31:22 +0200
Subject: [PATCH 3/3] lib/libc/aarch64/string: add timingsafe_memcmp() assembly
 implementation

A port of the amd64 implementation with some slight changes due to
differences in instructions provided by aarch64.  No ASIMD for the
same reason as the amd64 code: it's just not particularly suitable
for this application.

Event: EuroBSDcon 2024
---
 lib/libc/aarch64/string/Makefile.inc        |   1 +
 lib/libc/aarch64/string/timingsafe_memcmp.S | 126 ++++++++++++++++++++
 2 files changed, 127 insertions(+)
 create mode 100644 lib/libc/aarch64/string/timingsafe_memcmp.S

diff --git a/lib/libc/aarch64/string/Makefile.inc b/lib/libc/aarch64/string/Makefile.inc
index 70436dc8480d..ed5bfc916aa9 100644
--- a/lib/libc/aarch64/string/Makefile.inc
+++ b/lib/libc/aarch64/string/Makefile.inc
@@ -33,6 +33,7 @@ MDSRCS+= \
 	strlcat.c \
 	strlen.S \
 	timingsafe_bcmp.S \
+	timingsafe_memcmp.S \
 	bcopy.c \
 	bzero.c
 
diff --git a/lib/libc/aarch64/string/timingsafe_memcmp.S b/lib/libc/aarch64/string/timingsafe_memcmp.S
new file mode 100644
index 000000000000..28fdd911a387
--- /dev/null
+++ b/lib/libc/aarch64/string/timingsafe_memcmp.S
@@ -0,0 +1,126 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2024 Robert Clausecker
+ */
+
+#include <machine/asm.h>
+
+/*
+ * int timingsafe_memcmp(const void *b1, const void *b2, size_t len)
+ *
+ * In: x0 = b1, x1 = b2, x2 = len.  Out: w0 is negative, zero, or positive
+ * when b1 compares less than, equal to, or greater than b2 as byte strings.
+ *
+ * Every branch depends on len only; the first differing chunk is picked
+ * with conditional selects.  Chunks are byte swapped before the unsigned
+ * compare, which assumes the loads are little endian.
+ */
+ENTRY(timingsafe_memcmp)
+	cmp	x2, #16			// at least 17 bytes to process?
+	bhi	.Lgt16
+
+	cmp	x2, #8			// at least 9 bytes to process?
+	bhi	.L0916
+
+	cmp	x2, #4			// at least 5 bytes to process?
+	bhi	.L0508
+
+	cmp	x2, #2			// at least 3 bytes to process?
+	bhi	.L0304
+
+	cbnz	x2, .L0102		// buffer empty?
+
+	mov	w0, #0			// empty buffer always matches
+	ret
+
+.L0102:	ldrb	w3, [x0]		// load first bytes
+	ldrb	w4, [x1]
+	sub	x2, x2, #1
+	ldrb	w5, [x0, x2]		// load last bytes
+	ldrb	w6, [x1, x2]
+	bfi	w5, w3, #8, #8		// join bytes in big endian
+	bfi	w6, w4, #8, #8
+	sub	w0, w5, w6		// 16 bit values: difference cannot overflow
+	ret
+
+.L0304:	ldrh	w3, [x0]		// load first halfwords
+	ldrh	w4, [x1]
+	sub	x2, x2, #2
+	ldrh	w5, [x0, x2]		// load last halfwords
+	ldrh	w6, [x1, x2]
+	bfi	w3, w5, #16, #16	// join halfwords in little endian
+	bfi	w4, w6, #16, #16
+	rev	w3, w3			// swap byte order
+	rev	w4, w4
+	cmp	w3, w4
+	csetm	w0, lo			// w0 = w3 < w4 ? -1 : 0
+	csinc	w0, w0, wzr, ls		// w0 = w3 <= w4 ? w0 : 1
+	ret
+
+.L0508:	ldr	w3, [x0]		// load first words
+	ldr	w4, [x1]
+	sub	x2, x2, #4
+	ldr	w5, [x0, x2]		// load last words
+	ldr	w6, [x1, x2]
+	bfi	x3, x5, #32, #32	// join words in little endian
+	bfi	x4, x6, #32, #32
+	rev	x3, x3			// swap byte order
+	rev	x4, x4
+	cmp	x3, x4
+	csetm	w0, lo			// w0 = x3 < x4 ? -1 : 0
+	csinc	w0, w0, wzr, ls		// w0 = x3 <= x4 ? w0 : 1
+	ret
+
+.L0916:	ldr	x3, [x0]		// load first doublewords
+	ldr	x4, [x1]
+	sub	x2, x2, #8
+	ldr	x5, [x0, x2]		// load last doublewords
+	ldr	x6, [x1, x2]
+	cmp	x3, x4			// mismatch in first pair?
+	csel	x3, x3, x5, ne		// use second pair if first pair equal
+	csel	x4, x4, x6, ne
+	rev	x3, x3			// swap byte order
+	rev	x4, x4
+	cmp	x3, x4
+	csetm	w0, lo			// w0 = x3 < x4 ? -1 : 0
+	csinc	w0, w0, wzr, ls		// w0 = x3 <= x4 ? w0 : 1
+	ret
+
+	/* more than 16 bytes: process buffer in a loop */
+.Lgt16:	ldp	x3, x4, [x0], #16
+	ldp	x5, x6, [x1], #16
+	cmp	x3, x5			// mismatch in first pair?
+	csel	x3, x3, x4, ne		// use second pair if first pair equal
+	csel	x5, x5, x6, ne
+	subs	x2, x2, #32		// enough left for another iteration?
+	bls	.Ltail
+
+0:	ldp	x4, x7, [x0], #16
+	ldp	x6, x8, [x1], #16
+	cmp	x4, x6			// mismatch in first pair?
+	csel	x4, x4, x7, ne		// if not, try second pair
+	csel	x6, x6, x8, ne
+	cmp	x3, x5			// was there a mismatch previously?
+	csel	x3, x3, x4, ne		// apply new pair if there was not
+	csel	x5, x5, x6, ne
+	subs	x2, x2, #16
+	bhi	0b
+
+.Ltail:	add	x0, x0, x2		// point to the last 16 bytes in the buffer
+	add	x1, x1, x2
+	ldp	x4, x7, [x0]
+	ldp	x6, x8, [x1]
+	cmp	x4, x6			// mismatch in first pair?
+	csel	x4, x4, x7, ne		// if not, try second pair
+	csel	x6, x6, x8, ne
+	cmp	x3, x5			// was there a mismatch previously?
+	csel	x3, x3, x4, ne		// apply new pair if there was not
+	csel	x5, x5, x6, ne
+	rev	x3, x3			// swap byte order
+	rev	x5, x5
+	cmp	x3, x5
+	csetm	w0, lo			// w0 = x3 < x5 ? -1 : 0
+	csinc	w0, w0, wzr, ls		// w0 = x3 <= x5 ? w0 : 1
+	ret
+END(timingsafe_memcmp)
-- 
2.46.0