From 7e3bc53ae118a8c987036d9ae60e72d9f63b809e Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Sat, 16 Sep 2023 01:29:39 -0400 Subject: [PATCH 01/30] lib/libc/amd64/string/strcmp.S: add baseline implementation This is the most complicated one so far. The basic idea is to process the bulk of the string in aligned blocks of 16 bytes such that one string runs ahead and the other runs behind. The string that runs ahead is checked for NUL bytes, the one that runs behind is compared with the corresponding chunk of the string that runs ahead. This trades an extra load per iteration for the very complicated block-reassembly needed in the other implementations (bionic, glibc). On the flip side, we need two code paths depending on the relative alignment of the two buffers. The initial part of the string is compared directly if it is known not to cross a page boundary. Otherwise, a complex slow path to avoid crossing into unmapped memory commences. Performance-wise we beat bionic for misaligned strings (i.e. the strings do not share an alignment offset) and reach comparable performance for aligned strings. glibc is a bit better as it has a special kernel for AVX-512, where this stuff is a bit easier to do. Sponsored by: The FreeBSD Foundation Differential Revision: https://reviews.freebsd.org/D41971 Approved by: ... MFC after: 1 week --- lib/libc/amd64/string/strcmp.S | 299 ++++++++++++++++++++++++++++++++- 1 file changed, 292 insertions(+), 7 deletions(-) diff --git a/lib/libc/amd64/string/strcmp.S b/lib/libc/amd64/string/strcmp.S index 437db7eca43a..eb354bd2af82 100644 --- a/lib/libc/amd64/string/strcmp.S +++ b/lib/libc/amd64/string/strcmp.S @@ -1,14 +1,33 @@ -/* - * Written by J.T. Conklin - * Public domain. +/*- + * Copyright (c) 2023, The FreeBSD Foundation + * + * SPDX-License-Identifier: BSD-2-Clause + * + * Portions of this software were developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. 
+ * + * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S + * written by J.T. Conklin that was originally + * dedicated to the public domain. */ #include <machine/asm.h> +#include <machine/param.h> + #if 0 RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $") #endif -ENTRY(strcmp) +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(strcmp) + ARCHFUNC(strcmp, scalar) + ARCHFUNC(strcmp, baseline) +ENDARCHFUNCS(strcmp) + +ARCHENTRY(strcmp, scalar) /* * Align s1 to word boundary. * Consider unrolling loop? @@ -39,7 +58,7 @@ ENTRY(strcmp) movabsq $0x8080808080808080,%r9 subq $8,%rsi - .align 4 + ALIGN_TEXT .Lword_loop: movq 8(%rdi),%rax addq $8,%rdi @@ -53,7 +72,7 @@ testq %r9,%rdx je .Lword_loop - .align 4 + ALIGN_TEXT .Lbyte_loop: movb (%rdi),%al incq %rdi @@ -69,6 +88,272 @@ movzbq %dl,%rdx subq %rdx,%rax ret -END(strcmp) +ARCHEND(strcmp, scalar) + +ARCHENTRY(strcmp, baseline) + /* check if either string crosses a page in the head */ + lea 15(%rdi), %r8d # end of head + lea 15(%rsi), %r9d + mov %edi, %eax + mov %esi, %edx + xor %edi, %r8d # bits that changed between first and last byte + xor %esi, %r9d + and $~0xf, %rdi # align heads to 16 bytes + and $~0xf, %rsi + or %r8d, %r9d # in either RSI or RDI + and $0xf, %eax # offset from alignment + and $0xf, %edx + pxor %xmm1, %xmm1 + test $PAGE_SIZE, %r9d # did the page change? + jz 0f # if not, take fast path + + /* heads may cross page boundary, avoid unmapped loads */ + movdqa (%rdi), %xmm0 # load aligned heads + movdqa (%rsi), %xmm2 + mov $-1, %r8d + mov $-1, %r9d + mov %eax, %ecx + shl %cl, %r8d # string head in XMM0 + mov %edx, %ecx + shl %cl, %r9d # string head in XMM2 + movdqa %xmm0, -40(%rsp) # stash copies of the heads on the stack + movdqa %xmm2, -24(%rsp) + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm0, %r10d + pmovmskb %xmm2, %r11d + test %r8d, %r10d # NUL byte present in first string? 
+ lea -40(%rsp), %r8 + cmovz %rdi, %r8 + test %r9d, %r11d # NUL byte present in second string? + lea -24(%rsp), %r9 + cmovz %rsi, %r9 + movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads + movdqu (%r9, %rdx, 1), %xmm4 + jmp 1f + +0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads + movdqu (%rsi, %rdx, 1), %xmm4 +1: pxor %xmm2, %xmm2 + pcmpeqb %xmm0, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm4 # which bytes match? + pandn %xmm4, %xmm2 # match and not NUL byte? + pmovmskb %xmm2, %r9d + xor $0xffff, %r9d # mismatch or NUL byte? + jnz .Lhead_mismatch + + /* load head and second chunk */ + movdqa 16(%rdi), %xmm2 # load second chunks + movdqa 16(%rsi), %xmm3 + sub %rdx, %rax # is a&0xf >= b&0xf? + jb .Lswapped # if not, proceed with swapped operands + + neg %rax + movdqu 16(%rsi, %rax, 1), %xmm0 + sub %rdi, %rsi # express RSI as distance from RDI + lea (%rsi, %rax, 1), %rdx # point RDX to offset in second string + neg %rax + pcmpeqb %xmm3, %xmm1 # ... corresponding to RDI + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $16, %rdi + test %r8d, %r8d + jnz .Lnul_found + xor $0xffff, %r9d + jnz .Lmismatch + add $16, %rdi # advance aligned pointers + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * RSI doesn't end within region 2, then we compare chunk B between the + * two strings. As RSI is known not to hold a NUL byte in regions 1 + * and 2 at this point, this also ensures that RDI has not ended yet. + */ + ALIGN_TEXT +0: movdqu (%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? 
+ pxor %xmm1, %xmm1 + pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_found + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatch + + /* main loop unrolled twice */ + movdqu 16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb 16(%rdi, %rsi, 1), %xmm1 # end of string in RSI? + pcmpeqb 16(%rdi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rdi + test %r8d, %r8d + jnz .Lnul_found2 + xor $0xffff, %r9d # any mismatches? + jz 0b + + sub $16, %rdi # roll back second increment + + /* a mismatch has been found between RDX and RSI */ +.Lmismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rdi, %rdx # turn RDX from offset to pointer + movzbl (%rdx, %r9, 1), %ecx + movzbl (%rdi, %r9, 1), %eax + sub %ecx, %eax # difference of the mismatching chars + ret + + /* mismatch in true heads */ +.Lhead_mismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rax, %rdi # return to true heads + add %rdx, %rsi + movzbl (%rdi, %r9, 1), %eax # mismatching characters + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax + ret + +.Lnul_found2: + sub $16, %rdi # roll back second increment + + /* a NUL has been found in RSI */ +.Lnul_found: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX + xor $0xffff, %r9d # mask of mismatches + or %r8d, %r9d # NUL bytes also count as mismatches + jnz .Lmismatch + + /* + * (RDI) == (RSI) and NUL is past the string. + * Compare (RSI) with the corresponding part + * of the other string until the NUL byte. 
+ */ + movdqu (%rdi, %rax, 1), %xmm0 + pcmpeqb (%rdi, %rsi, 1), %xmm0 + add %rdi, %rsi # restore RSI pointer + add %rax, %rdi # point RDI to chunk corresponding to (RSI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + ret + + /* + * If (a&0xf) < (b&0xf), we do the same thing but with swapped + * operands. I found that this performs slightly better than + * using conditional moves to do the swap branchless. + */ +.Lswapped: + movdqu 16(%rdi, %rax, 1), %xmm0 + sub %rsi, %rdi # express RDI as distance from RSI + lea (%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI + neg %rax # make difference positive + pcmpeqb %xmm2, %xmm1 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $16, %rsi # advance aligned pointers + test %r8d, %r8d + jnz .Lnul_founds + xor $0xffff, %r9d + jnz .Lmismatchs + add $16, %rsi + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * RSI doesn't end within region 2, then we compare chunk B between the + * two strings. As RSI is known not to hold a NUL byte in regions 1 + * and 2 at this point, this also ensures that RDI has not ended yet. + */ + ALIGN_TEXT +0: movdqu (%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RSI? + pcmpeqb (%rsi), %xmm0 # where do the chunks match? 
+ pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + test %r8d, %r8d + jnz .Lnul_founds + xor $0xffff, %r9d # any mismatches? + jnz .Lmismatchs + + /* main loop unrolled twice */ + movdqu 16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? + pxor %xmm1, %xmm1 + pcmpeqb 16(%rsi, %rdi, 1), %xmm1 # end of string in RSI? + pcmpeqb 16(%rsi), %xmm0 # where do the chunks match? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + add $32, %rsi + test %r8d, %r8d + jnz .Lnul_found2s + xor $0xffff, %r9d # any mismatches? + jz 0b + + sub $16, %rsi # roll back second increment + + /* a mismatch has been found between RDX and RDI */ +.Lmismatchs: + tzcnt %r9d, %r9d # where is the mismatch? + add %rsi, %rdx # turn RDX from offset to pointer + movzbl (%rdx, %r9, 1), %eax + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax # difference of the mismatching chars + ret + +.Lnul_found2s: + sub $16, %rsi # roll back second increment + + /* a NUL has been found in RSI */ +.Lnul_founds: + mov %eax, %ecx + mov %r8d, %r10d + shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX + xor $0xffff, %r9d # mask of mismatches + or %r8d, %r9d # NUL bytes also count as mismatches + jnz .Lmismatchs + + /* + * (RDI) == (RSI) and NUL is past the string. + * Compare (RSI) with the corresponding part + * of the other string until the NUL byte. 
+ */ + movdqu (%rsi, %rax, 1), %xmm0 + pcmpeqb (%rsi, %rdi, 1), %xmm0 + add %rsi, %rdi # restore RDI pointer + add %rax, %rsi # point RSI to chunk corresponding to (RDI) + pmovmskb %xmm0, %ecx # mask of matches + not %ecx # mask of mismatches + or %r10d, %ecx # mask of mismatches or NUL bytes + tzcnt %ecx, %ecx # location of first mismatch + movzbl (%rdi, %rcx, 1), %eax + movzbl (%rsi, %rcx, 1), %ecx + sub %ecx, %eax + ret +ARCHEND(strcmp, baseline) .section .note.GNU-stack,"",%progbits -- 2.43.0 From 1207935d44e48795e955141f3ca432f7e0be53af Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 25 Sep 2023 02:18:06 -0400 Subject: [PATCH 02/30] share/man/man7/simd.7: document new amd64 baseline strcmp() Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D41971 --- share/man/man7/simd.7 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index 3fd8890c4f53..fc91ded85698 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd September 2, 2023 +.Dd September 25, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -68,7 +68,7 @@ Enhanced functions are present in the following architectures: .It strcat Ta Ta Ta S Ta S .It strchr Ta S Ta Ta S1 Ta S .It strchrnul Ta Ta Ta S1 -.It strcmp Ta Ta S Ta S Ta S +.It strcmp Ta Ta S Ta S1 Ta S .It strcpy Ta Ta Ta S1 Ta S Ta S2 .It strcspn Ta Ta Ta S2 .It strlen Ta Ta S Ta S1 -- 2.43.0 From 068d55849c6ff63cbecc5a2de96d8693bf296886 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 25 Sep 2023 17:43:12 -0400 Subject: [PATCH 03/30] lib/libc/amd64/string: implement strpbrk() through strcspn() This lets us use our optimised strcspn() routine for strpbrk() calls. Sponsored by: The FreeBSD Foundation Approved by: ... 
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D41980 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/strcspn.S | 18 +++++++------ lib/libc/amd64/string/strpbrk.c | 43 ++++++++++++++++++++++++++++++ 3 files changed, 54 insertions(+), 8 deletions(-) create mode 100644 lib/libc/amd64/string/strpbrk.c diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index 09bf7c8f251e..50c70007e99b 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -14,6 +14,7 @@ MDSRCS+= \ strcspn.S \ strlen.S \ strnlen.c \ + strpbrk.c \ strspn.S \ timingsafe_bcmp.S \ timingsafe_memcmp.S diff --git a/lib/libc/amd64/string/strcspn.S b/lib/libc/amd64/string/strcspn.S index 53100eeea9a5..eab669edce72 100644 --- a/lib/libc/amd64/string/strcspn.S +++ b/lib/libc/amd64/string/strcspn.S @@ -33,13 +33,15 @@ #define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ -ARCHFUNCS(strcspn) - ARCHFUNC(strcspn, scalar) + .weak strcspn + .set strcspn, __strcspn +ARCHFUNCS(__strcspn) + ARCHFUNC(__strcspn, scalar) NOARCHFUNC - ARCHFUNC(strcspn, x86_64_v2) -ENDARCHFUNCS(strcspn) + ARCHFUNC(__strcspn, x86_64_v2) +ENDARCHFUNCS(__strcspn) -ARCHENTRY(strcspn, scalar) +ARCHENTRY(__strcspn, scalar) push %rbp # align stack to enable function call mov %rsp, %rbp sub $256, %rsp # allocate space for lookup table @@ -122,7 +124,7 @@ ARCHENTRY(strcspn, scalar) sub (%rsp), %rax # length of prefix before match leave ret -ARCHEND(strcspn, scalar) +ARCHEND(__strcspn, scalar) /* * This kernel uses pcmpistri to do the heavy lifting. 
@@ -134,7 +136,7 @@ ARCHEND(strcspn, scalar) * 17--32: two pcmpistri per 16 bytes of input * >=33: fall back to look up table */ -ARCHENTRY(strcspn, x86_64_v2) +ARCHENTRY(__strcspn, x86_64_v2) push %rbp mov %rsp, %rbp sub $256, %rsp @@ -368,6 +370,6 @@ ARCHENTRY(strcspn, x86_64_v2) 2: sub %rdi, %rax # number of characters preceding match leave ret -ARCHEND(strcspn, x86_64_v2) +ARCHEND(__strcspn, x86_64_v2) .section .note.GNU-stack,"",%progbits diff --git a/lib/libc/amd64/string/strpbrk.c b/lib/libc/amd64/string/strpbrk.c new file mode 100644 index 000000000000..87f587789991 --- /dev/null +++ b/lib/libc/amd64/string/strpbrk.c @@ -0,0 +1,43 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include + +#include + +size_t __strcspn(const char *, const char *); + +char * +strpbrk(const char *s, const char *charset) +{ + size_t loc; + + loc = __strcspn(s, charset); + + return (s[loc] == '\0' ? NULL : (char *)&s[loc]); +} -- 2.43.0 From ed40826389474ace68ac73906cc94b1acc93cd70 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 25 Sep 2023 17:45:26 -0400 Subject: [PATCH 04/30] share/man/man7/simd.7: document amd64 SIMD use for strpbrk() Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D41980 --- share/man/man7/simd.7 | 1 + 1 file changed, 1 insertion(+) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index fc91ded85698..cab48a01ead1 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -76,6 +76,7 @@ Enhanced functions are present in the following architectures: .It strncpy Ta Ta Ta Ta Ta S2 .It strnlen Ta Ta Ta S1 .It strrchr Ta S Ta Ta Ta S +.It strpbrk Ta Ta Ta S2 .It strspn Ta Ta Ta S2 .It swab Ta Ta Ta Ta S .It timingsafe_bcmp Ta Ta Ta S1 -- 2.43.0 From 5ca519940e49d508d68df890d1b98880f6c4619c Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Thu, 28 Sep 2023 16:40:20 -0400 Subject: [PATCH 05/30] lib/libc/tests/string: add unit tests for strncmp(3) These are patterned after the previously added (D41970) strcmp tests, but are extended to check for various length conditions. 
Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42122 --- lib/libc/tests/string/Makefile | 1 + lib/libc/tests/string/strncmp_test.c | 165 +++++++++++++++++++++++++++ 2 files changed, 166 insertions(+) create mode 100644 lib/libc/tests/string/strncmp_test.c diff --git a/lib/libc/tests/string/Makefile b/lib/libc/tests/string/Makefile index a090e1bd3463..5874f7b6b873 100644 --- a/lib/libc/tests/string/Makefile +++ b/lib/libc/tests/string/Makefile @@ -11,6 +11,7 @@ ATF_TESTS_C+= flsl_test ATF_TESTS_C+= flsll_test ATF_TESTS_C+= memcmp_test ATF_TESTS_C+= memset_s_test +ATF_TESTS_C+= strncmp_test ATF_TESTS_C+= stpncpy_test ATF_TESTS_C+= strcmp2_test ATF_TESTS_C+= strcspn_test diff --git a/lib/libc/tests/string/strncmp_test.c b/lib/libc/tests/string/strncmp_test.c new file mode 100644 index 000000000000..989c58bcfedf --- /dev/null +++ b/lib/libc/tests/string/strncmp_test.c @@ -0,0 +1,165 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include + +#include +#include +#include + +int (*volatile strncmp_fn)(const char *, const char *, size_t); + +static void +alignment_testcase(char *a, char *b, int want, size_t len) +{ + int res; + + res = strncmp_fn(a, b, len); + ATF_CHECK_MSG(want == (res > 0) - (res < 0), + "strcmp(%p \"%s\", %p \"%s\", %zu) = %d != %d", + (void *)a, a, (void *)b, b, len, res, want); +} + +static void +check_strncmp_alignments(char a[], char b[], + size_t a_off, size_t b_off, size_t len, size_t pos) +{ + char *a_str, *b_str, a_orig, b_orig; + + a[a_off] = '\0'; + b[b_off] = '\0'; + + a_str = a + a_off + 1; + b_str = b + b_off + 1; + + a_str[len] = '\0'; + b_str[len] = '\0'; + a_str[len+1] = 'A'; + b_str[len+1] = 'B'; + + a_orig = a_str[pos]; + b_orig = b_str[pos]; + + alignment_testcase(a_str, b_str, 0, len + 16); + alignment_testcase(a_str, b_str, 0, len + 1); + alignment_testcase(a_str, b_str, 0, len); + + if (pos < len) { + a_str[pos] = '\0'; + alignment_testcase(a_str, b_str, -1, len + 16); + alignment_testcase(a_str, b_str, -1, len + 1); + alignment_testcase(a_str, b_str, -1, len); + alignment_testcase(a_str, b_str, -1, pos + 1); + alignment_testcase(a_str, b_str, 0, pos); + a_str[pos] = a_orig; + + b_str[pos] = '\0'; + alignment_testcase(a_str, b_str, 1, len + 16); + alignment_testcase(a_str, b_str, 1, len + 1); + alignment_testcase(a_str, b_str, 1, len); + alignment_testcase(a_str, b_str, 1, pos + 1); + alignment_testcase(a_str, b_str, 0, 
pos); + b_str[pos] = b_orig; + } + + a_str[pos] = 'X'; + alignment_testcase(a_str, b_str, 1, len + 16); + alignment_testcase(a_str, b_str, 0, pos); + alignment_testcase(a_str, b_str, 1, pos + 1); + if (pos < len) { + alignment_testcase(a_str, b_str, 1, len); + alignment_testcase(a_str, b_str, 1, len + 1); + } + a_str[pos] = a_orig; + + b_str[pos] = 'X'; + alignment_testcase(a_str, b_str, -1, len + 16); + alignment_testcase(a_str, b_str, 0, pos); + alignment_testcase(a_str, b_str, -1, pos + 1); + if (pos < len) { + alignment_testcase(a_str, b_str, -1, len); + alignment_testcase(a_str, b_str, -1, len + 1); + } + b_str[pos] = b_orig; + + a[a_off] = '-'; + b[b_off] = '-'; + a_str[len] = '-'; + b_str[len] = '-'; + a_str[len+1] = '-'; + b_str[len+1] = '-'; +} + +ATF_TC(strncmp_alignments); +ATF_TC_HEAD(strncmp_alignments, tc) +{ + atf_tc_set_md_var(tc, "descr", "Test strncmp(3) with various alignments"); +} + +ATF_TC_BODY(strncmp_alignments, tc) +{ + size_t a_off, b_off, len, pos; + char a[64+16+16+3], b[64+16+16+3]; + + memset(a, '-', sizeof(a)); + memset(b, '-', sizeof(b)); + a[sizeof(a) - 1] = '\0'; + b[sizeof(b) - 1] = '\0'; + + for (a_off = 0; a_off < 16; a_off++) + for (b_off = 0; b_off < 16; b_off++) + for (len = 1; len <= 64; len++) + for (pos = 0; pos <= len; pos++) + check_strncmp_alignments(a, b, a_off, b_off, len, pos); +} + +ATF_TC(strncmp_null); +ATF_TC_HEAD(strncmp_null, tc) +{ + atf_tc_set_md_var(tc, "descr", "Test strncmp(3) with null pointers"); +} + +ATF_TC_BODY(strncmp_null, tc) +{ + alignment_testcase(NULL, NULL, 0, 0); +} + +ATF_TP_ADD_TCS(tp) +{ + void *dl_handle; + + dl_handle = dlopen(NULL, RTLD_LAZY); + strncmp_fn = dlsym(dl_handle, "test_strncmp"); + if (strncmp_fn == NULL) + strncmp_fn = strncmp; + + ATF_TP_ADD_TC(tp, strncmp_alignments); + ATF_TP_ADD_TC(tp, strncmp_null); + + return atf_no_error(); +} -- 2.43.0 From 1e65999d033ec635740b51f85cc1a809806e9d4c Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Wed, 27 Sep 2023 18:46:04 -0400 
Subject: [PATCH 06/30] lib/libc/amd64/string: add strncmp scalar, baseline implementation The scalar implementation is fairly straightforward and merely unrolled four times. The baseline implementation closely follows D41971 with appropriate extensions and extra code paths to pay attention to string length. Performance is quite good. We beat both glibc (except for very long strings, but they likely use AVX which we don't) and Bionic (except for medium-sized aligned strings, where we are still in the same ballpark). Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42122 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/strncmp.S | 488 +++++++++++++++++++++++++++++ 2 files changed, 489 insertions(+) create mode 100644 lib/libc/amd64/string/strncmp.S diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index 50c70007e99b..51645ba3b8af 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -13,6 +13,7 @@ MDSRCS+= \ strcpy.c \ strcspn.S \ strlen.S \ + strncmp.S \ strnlen.c \ strpbrk.c \ strspn.S \ diff --git a/lib/libc/amd64/string/strncmp.S b/lib/libc/amd64/string/strncmp.S new file mode 100644 index 000000000000..932cf078bdfc --- /dev/null +++ b/lib/libc/amd64/string/strncmp.S @@ -0,0 +1,488 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include <machine/asm.h> +#include <machine/param.h> + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(strncmp) + ARCHFUNC(strncmp, scalar) + ARCHFUNC(strncmp, baseline) +ENDARCHFUNCS(strncmp) + +/* + * This is just the scalar loop unrolled a bunch of times. + */ +ARCHENTRY(strncmp, scalar) + xor %eax, %eax + sub $4, %rdx # 4 chars left to compare? + jbe 1f + + ALIGN_TEXT +0: movzbl (%rdi), %ecx + test %ecx, %ecx # NUL char in first string? + jz .L0 + cmpb (%rsi), %cl # mismatch between strings? + jnz .L0 + + movzbl 1(%rdi), %ecx + test %ecx, %ecx + jz .L1 + cmpb 1(%rsi), %cl + jnz .L1 + + movzbl 2(%rdi), %ecx + test %ecx, %ecx + jz .L2 + cmpb 2(%rsi), %cl + jnz .L2 + + movzbl 3(%rdi), %ecx + test %ecx, %ecx + jz .L3 + cmpb 3(%rsi), %cl + jnz .L3 + + add $4, %rdi # advance to next iteration + add $4, %rsi + sub $4, %rdx + ja 0b + + /* end of string within the next 4 characters */
+ jz .Leq + movzbl (%rdi), %ecx + test %ecx, %ecx + jz .L0 + cmpb (%rsi), %cl + jnz .L0 + + cmp $-3, %edx # end of string reached after 1 char? + jz .Leq + movzbl 1(%rdi), %ecx + test %ecx, %ecx + jz .L1 + cmpb 1(%rsi), %cl + jnz .L1 + + cmp $-2, %edx + jz .Leq + movzbl 2(%rdi), %ecx + test %ecx, %ecx + jz .L2 + cmpb 2(%rsi), %cl + jnz .L2 + + cmp $-1, %edx # either end of string after 3 chars, + jz .Leq # or it boils down to the last char + +.L3: inc %eax +.L2: inc %eax +.L1: inc %eax +.L0: movzbl (%rsi, %rax, 1), %ecx + movzbl (%rdi, %rax, 1), %eax + sub %ecx, %eax +.Leq: ret +ARCHEND(strncmp, scalar) + +ARCHENTRY(strncmp, baseline) + push %rbx + sub $1, %rdx # RDX--, so RDX points to the last byte to compare + jb .Lempty # where there any bytes to compare at all? + + lea 15(%rdi), %r8d # end of head + lea 15(%rsi), %r9d + mov %edi, %eax + mov %esi, %ebx + xor %edi, %r8d # bits that changed between first and last byte + xor %esi, %r9d + and $~0xf, %rdi # align heads to 16 bytes + and $~0xf, %rsi + or %r8d, %r9d + and $0xf, %eax # offset from alignment + and $0xf, %ebx + movdqa (%rdi), %xmm0 # load aligned heads + movdqa (%rsi), %xmm2 + pxor %xmm1, %xmm1 + cmp $16, %rdx # end of buffer within the first 32 bytes? + jb .Llt16 + + test $PAGE_SIZE, %r9d # did the page change? + jz 0f # if not, take fast path + + + /* heads may cross page boundary, avoid unmapped loads */ + movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack + movdqa %xmm2, -16(%rsp) + mov $-1, %r8d + mov $-1, %r9d + mov %eax, %ecx + shl %cl, %r8d # string head in XMM0 + mov %ebx, %ecx + shl %cl, %r9d # string head in XMM2 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm0, %r10d + pmovmskb %xmm2, %r11d + test %r8d, %r10d # NUL byte present in first string? + lea -32(%rsp), %r8 + cmovz %rdi, %r8 + test %r9d, %r11d # NUL byte present in second string? 
+ lea -16(%rsp), %r9 + cmovz %rsi, %r9 + movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads + movdqu (%r9, %rbx, 1), %xmm4 + jmp 1f + + /* rdx == 0 */ +.Lempty: + xor %eax, %eax # zero-length buffers compare equal + pop %rbx + ret + +0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads + movdqu (%rsi, %rbx, 1), %xmm4 +1: pxor %xmm2, %xmm2 + pcmpeqb %xmm0, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm4 # which bytes match? + pandn %xmm4, %xmm2 # match and not NUL byte? + pmovmskb %xmm2, %r9d + xor $0xffff, %r9d # mismatch or NUL byte? + jnz .Lhead_mismatch + + /* load head and second chunk */ + movdqa 16(%rdi), %xmm2 # load second chunks + movdqa 16(%rsi), %xmm3 + lea -16(%rdx, %rbx, 1), %rdx # account for length of RSI chunk + sub %rbx, %rax # is a&0xf >= b&0xf? + jb .Lswapped # if not, proceed with swapped operands + jmp .Lnormal + + /* buffer ends within the first 16 bytes */ +.Llt16: test $PAGE_SIZE, %r9d # did the page change? + jz 0f # if not, take fast path + + /* heads may cross page boundary */ + movdqa %xmm0, -32(%rsp) # stash copies of the heads on the stack + movdqa %xmm2, -16(%rsp) + mov $-1, %r8d + mov $-1, %r9d + mov %eax, %ecx + shl %cl, %r8d # string head in XMM0 + mov %ebx, %ecx + shl %cl, %r9d # string head in XMM2 + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm1, %xmm2 + pmovmskb %xmm0, %r10d + pmovmskb %xmm2, %r11d + lea (%rdx, %rax, 1), %ecx # location of last buffer byte in xmm0 + bts %ecx, %r10d # treat as if NUL byte present + lea (%rdx, %rbx, 1), %ecx + bts %ecx, %r11d + test %r8w, %r10w # NUL byte present in first string head? + lea -32(%rsp), %r8 + cmovz %rdi, %r8 + test %r9w, %r11w # NUL byte present in second string head? + lea -16(%rsp), %r9 + cmovz %rsi, %r9 + movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads + movdqu (%r9, %rbx, 1), %xmm4 + jmp 1f + +0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads + movdqu (%rsi, %rbx, 1), %xmm4 +1: pxor %xmm2, %xmm2 + pcmpeqb %xmm0, %xmm2 # NUL byte present? 
+ pcmpeqb %xmm0, %xmm4 # which bytes match? + pandn %xmm4, %xmm2 # match and not NUL byte? + pmovmskb %xmm2, %r9d + btr %edx, %r9d # induce mismatch in last byte of buffer + not %r9d # mismatch or NUL byte? + + /* mismatch in true heads */ + ALIGN_TEXT +.Lhead_mismatch: + tzcnt %r9d, %r9d # where is the mismatch? + add %rax, %rdi # return to true heads + add %rbx, %rsi + movzbl (%rdi, %r9, 1), %eax # mismatching characters + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax + pop %rbx + ret + + /* rax >= 0 */ + ALIGN_TEXT +.Lnormal: + neg %rax + movdqu 16(%rsi, %rax, 1), %xmm0 + sub %rdi, %rsi # express RSI as distance from RDI + lea (%rsi, %rax, 1), %rbx # point RBX to offset in second string + neg %rax # ... corresponding to RDI + pcmpeqb %xmm3, %xmm1 # NUL present? + pcmpeqb %xmm2, %xmm0 # Mismatch between chunks? + pmovmskb %xmm1, %r8d + pmovmskb %xmm0, %r9d + mov $16, %ecx + cmp %rcx, %rdx # does the buffer end within (RDI,RSI,1)? + cmovb %edx, %ecx # ECX = min(16, RDX) + add $32, %rdi # advance to next iteration + bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte + test %r8w, %r8w # NUL or end of buffer found? + jnz .Lnul_found2 + xor $0xffff, %r9d + jnz .Lmismatch2 + sub $48, %rdx # end of buffer within first main loop iteration? + jb .Ltail # if yes, process tail + + /* + * During the main loop, the layout of the two strings is something like: + * + * v ------1------ v ------2------ v + * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... + * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... + * + * where v indicates the alignment boundaries and corresponding chunks + * of the strings have the same letters. Chunk A has been checked in + * the previous iteration. This iteration, we first check that string + * RSI doesn't end within region 2, then we compare chunk B between the + * two strings. As RSI is known not to hold a NUL byte in regions 1 + * and 2 at this point, this also ensures that RDI has not ended yet. 
+ */
+ ALIGN_TEXT
+0: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?
+ pcmpeqb (%rdi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ test %r8d, %r8d
+ jnz .Lnul_found
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatch
+
+ /* main loop unrolled twice */
+ movdqu 16(%rdi, %rbx, 1), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rdi, %rsi, 1), %xmm1
+ pcmpeqb 16(%rdi), %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add $32, %rdi
+ test %r8d, %r8d
+ jnz .Lnul_found2
+ xor $0xffff, %r9d
+ jnz .Lmismatch2
+ sub $32, %rdx # end of buffer within next iteration?
+ jae 0b
+
+ /* end of buffer will occur in next 32 bytes */
+.Ltail: movdqu (%rdi, %rbx, 1), %xmm0 # chunk of 2nd string corresponding to RDI
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI?
+ pcmpeqb (%rdi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ bts %edx, %r8d # indicate NUL byte at last byte in buffer
+ test %r8w, %r8w # NUL byte in first chunk?
+ jnz .Lnul_found
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatch
+
+ /* main loop unrolled twice */
+ movdqu 16(%rdi, %rbx, 1), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rdi, %rsi, 1), %xmm1
+ pcmpeqb 16(%rdi), %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ sub $16, %edx # take first half into account
+ bts %edx, %r8d # indicate NUL byte at last byte in buffer
+ add $32, %rdi
+
+.Lnul_found2:
+ sub $16, %rdi
+
+.Lnul_found:
+ mov %eax, %ecx
+ mov %r8d, %r10d
+ shl %cl, %r8d # adjust NUL mask to positions in RDI/RBX
+ not %r9d # mask of mismatches
+ or %r8w, %r9w # NUL bytes also count as mismatches
+ jnz .Lmismatch
+
+ /*
+ * (RDI) == (RSI) and NUL is past the string.
+ * compare (RSI) with the corresponding part
+ * of the other string until the NUL byte.
+ */
+ movdqu (%rdi, %rax, 1), %xmm0
+ pcmpeqb (%rdi, %rsi, 1), %xmm0
+ add %rdi, %rsi # restore RSI pointer
+ add %rax, %rdi # point RDI to chunk corresponding to (RSI)
+ pmovmskb %xmm0, %ecx # mask of matches
+ not %ecx # mask of mismatches
+ or %r10d, %ecx # mask of mismatches or NUL bytes
+ tzcnt %ecx, %ecx # location of first mismatch
+ movzbl (%rdi, %rcx, 1), %eax
+ movzbl (%rsi, %rcx, 1), %ecx
+ sub %ecx, %eax
+ pop %rbx
+ ret
+
+.Lmismatch2:
+ sub $16, %rdi
+
+ /* a mismatch has been found between RBX and RSI */
+.Lmismatch:
+ tzcnt %r9d, %r9d # where is the mismatch?
+ add %rdi, %rbx # turn RBX from offset into pointer
+ movzbl (%rbx, %r9, 1), %ecx
+ movzbl (%rdi, %r9, 1), %eax
+ sub %ecx, %eax
+ pop %rbx
+ ret
+
+ /* rax < 0 */
+ ALIGN_TEXT
+.Lswapped:
+ movdqu 16(%rdi, %rax, 1), %xmm0
+ sub %rsi, %rdi # express RDI as distance from RSI
+ lea (%rdi, %rax, 1), %rbx # point RBX to offset in first string
+ pcmpeqb %xmm2, %xmm1 # NUL present?
+ pcmpeqb %xmm3, %xmm0 # mismatch between chunks?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add %rax, %rdx # RDX points to buffer end in RSI
+ neg %rax # ... corresponding to RSI
+ mov $16, %ecx
+ cmp %rcx, %rdx # does the buffer end within (RSI,RDI,1)?
+ cmovb %edx, %ecx # ECX = min(16, RDX)
+ add $32, %rsi
+ bts %ecx, %r8d # mark end-of-buffer as if there was a NUL byte
+ test %r8w, %r8w # NUL or end of buffer found?
+ jnz .Lnul_found2s
+ xor $0xffff, %r9d
+ jnz .Lmismatch2s
+ sub $48, %rdx # end of buffer within first main loop iteration?
+ jb .Ltails # if yes, process tail
+
+ ALIGN_TEXT
+0: movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI?
+ pcmpeqb (%rsi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ test %r8d, %r8d
+ jnz .Lnul_founds
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatchs
+
+ /* main loop unrolled twice */
+ movdqu 16(%rsi, %rbx, 1), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rsi, %rdi, 1), %xmm1
+ pcmpeqb 16(%rsi), %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ add $32, %rsi
+ test %r8d, %r8d
+ jnz .Lnul_found2s
+ xor $0xffff, %r9d
+ jnz .Lmismatch2s
+ sub $32, %rdx # end of buffer within next iteration?
+ jae 0b
+
+ /* end of buffer will occur in next 32 bytes */
+.Ltails:
+ movdqu (%rsi, %rbx, 1), %xmm0 # chunk of 1st string corresponding to RSI
+ pxor %xmm1, %xmm1
+ pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RDI?
+ pcmpeqb (%rsi), %xmm0 # where do the chunks match?
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ bts %edx, %r8d # indicate NUL byte at last byte in buffer
+ test %r8w, %r8w # NUL byte in first chunk?
+ jnz .Lnul_founds
+ xor $0xffff, %r9d # any mismatches?
+ jnz .Lmismatchs
+
+ /* main loop unrolled twice */
+ movdqu 16(%rsi, %rbx, 1), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb 16(%rsi, %rdi, 1), %xmm1
+ pcmpeqb 16(%rsi), %xmm0
+ pmovmskb %xmm1, %r8d
+ pmovmskb %xmm0, %r9d
+ sub $16, %edx # take first half into account
+ bts %edx, %r8d # indicate NUL byte at last byte in buffer
+ add $32, %rsi
+
+.Lnul_found2s:
+ sub $16, %rsi
+
+.Lnul_founds:
+ mov %eax, %ecx
+ mov %r8d, %r10d
+ shl %cl, %r8d # adjust NUL mask to positions in RSI/RBX
+ not %r9d # mask of mismatches
+ or %r8w, %r9w # NUL bytes also count as mismatches
+ jnz .Lmismatchs
+
+ movdqu (%rsi, %rax, 1), %xmm0
+ pcmpeqb (%rsi, %rdi, 1), %xmm0
+ add %rsi, %rdi # restore RDI pointer
+ add %rax, %rsi # point RSI to chunk corresponding to (RDI)
+ pmovmskb %xmm0, %ecx # mask of matches
+ not %ecx # mask of mismatches
+ or %r10d, %ecx # mask of mismatches or NUL bytes
+ tzcnt %ecx, %ecx # location of first mismatch
+ movzbl (%rdi, %rcx, 1), %eax
+ movzbl (%rsi, %rcx, 1), %ecx
+ sub %ecx, %eax
+ pop %rbx
+ ret
+
+.Lmismatch2s:
+ sub $16, %rsi
+
+.Lmismatchs:
+ tzcnt %r9d, %r9d # where is the mismatch?
+ add %rsi, %rbx # turn RBX from offset into pointer + movzbl (%rbx, %r9, 1), %eax + movzbl (%rsi, %r9, 1), %ecx + sub %ecx, %eax + pop %rbx + ret +ARCHEND(strncmp, baseline) + + .section .note.GNU-stack,"",%progbits -- 2.43.0 From 8aaab7a50959942f4bbc220a14b33a82be880aac Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Sat, 7 Oct 2023 22:46:36 -0400 Subject: [PATCH 07/30] share/man/man7/simd.7: document strncmp amd64 scalar, baseline implementations Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42122 --- share/man/man7/simd.7 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index cab48a01ead1..39642429c1c4 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd September 25, 2023 +.Dd October 7, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -72,7 +72,7 @@ Enhanced functions are present in the following architectures: .It strcpy Ta Ta Ta S1 Ta S Ta S2 .It strcspn Ta Ta Ta S2 .It strlen Ta Ta S Ta S1 -.It strncmp Ta Ta S Ta Ta S +.It strncmp Ta Ta S Ta S1 Ta S .It strncpy Ta Ta Ta Ta Ta S2 .It strnlen Ta Ta Ta S1 .It strrchr Ta S Ta Ta Ta S -- 2.43.0 From 8e751e625616a594f827d43f2209ffb85b6156ea Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Thu, 12 Oct 2023 01:37:41 -0400 Subject: [PATCH 08/30] lib/libc/amd64/string: add strrchr scalar, baseline implementation The baseline implementation is very straightforward, while the scalar implementation suffers from register pressure and the need to use SWAR techniques similar to those used for strchr(). Sponsored by: The FreeBSD Foundation Approved by: ... 
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42217 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/strrchr.S | 209 +++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 lib/libc/amd64/string/strrchr.S diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index 51645ba3b8af..2baa631fb3fa 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -16,6 +16,7 @@ MDSRCS+= \ strncmp.S \ strnlen.c \ strpbrk.c \ + strrchr.S \ strspn.S \ timingsafe_bcmp.S \ timingsafe_memcmp.S diff --git a/lib/libc/amd64/string/strrchr.S b/lib/libc/amd64/string/strrchr.S new file mode 100644 index 000000000000..e397bbcd3478 --- /dev/null +++ b/lib/libc/amd64/string/strrchr.S @@ -0,0 +1,209 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4,0x90 # 16-byte alignment, nop-filled + + .weak rindex + .set rindex, strrchr + +ARCHFUNCS(strrchr) + ARCHFUNC(strrchr, scalar) + ARCHFUNC(strrchr, baseline) +ENDARCHFUNCS(strrchr) + +ARCHENTRY(strrchr, scalar) + mov %edi, %ecx + and $~7, %rdi # align to 8 byte + movzbl %sil, %esi # clear stray high bits + movabs $0x0101010101010101, %r8 + mov (%rdi), %rax # load first word + imul %r8, %rsi # replicate char 8 times + + /* + * Unaligned input: align to 8 bytes. Then proceed the same + * way as with aligned input, but prevent matches before the + * beginning of the string. This is achieved by oring 0x01 + * into each byte of the buffer before the string + */ + shl $3, %ecx + mov %r8, %r10 + shl %cl, %r10 # 0x01 where the string is + xor %r8, %r10 # 0x01 where it is not + neg %r8 # negate 01..01 so we can use lea + movabs $0x8080808080808080, %r9 + + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + or %r10, %rax # ensure str != 0 before string + or %r10, %rcx # ensure str^c != 0 before string + bswap %rcx # in reverse order, to find last match + mov %rdi, %r10 # location of initial mismatch (if any) + xor %r11, %r11 # initial mismatch (none) + add $8, %rdi # advance to next iteration + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + not %rax # ~str + and %rdx, %rax # (str - 0x01..01) & ~str + and %r9, %rax # not including junk bits + jnz 1f # end of string? 
+ + lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + mov %rcx, %r11 # remember mismatch in head + jmp 0f + + /* main loop unrolled twice */ + ALIGN_TEXT +3: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + lea -8(%rdi), %rdx + cmovnz %rdx, %r10 # remember location of current mismatch + cmovnz %rcx, %r11 + +0: mov (%rdi), %rax # str + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + bswap %rcx # in reverse order, to find last match + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + not %rax # ~str + and %rdx, %rax # (str - 0x01..01) & ~str + and %r9, %rax # not including junk bits + jnz 2f # end of string? + + lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + cmovnz %rdi, %r10 # remember location of current mismatch + cmovnz %rcx, %r11 + + mov 8(%rdi), %rax # str + add $16, %rdi + mov %rsi, %rcx + xor %rax, %rcx # str ^ c + bswap %rcx + lea (%rax, %r8, 1), %rdx # str - 0x01..01 + not %rax # ~str + and %rdx, %rax # (str - 0x01..01) & ~str + and %r9, %rax # not including junk bits + jz 3b # end of string? + + /* NUL found */ +1: sub $8, %rdi # undo advance past buffer +2: lea (%rcx, %r8, 1), %rdx # (str ^ c) - 0x01..01 + not %rcx # ~(str ^ c) + and %rdx, %rcx # ((str ^ c - 0x01..01) & ~(str ^ c) + and %r9, %rcx # not including junk bits + lea -1(%rax), %rdx + xor %rdx, %rax # mask of bytes in the string + bswap %rdx # in reverse order + and %rdx, %rcx # c found in the tail? + cmovnz %rdi, %r10 + cmovnz %rcx, %r11 + bswap %r11 # unreverse byte order + bsr %r11, %rcx # last location of c in (R10) + shr $3, %rcx # as byte offset + lea (%r10, %rcx, 1), %rax # pointer to match + test %r11, %r11 # was there actually a match? 
+ cmovz %r11, %rax # if not, return null pointer + ret +ARCHEND(strrchr, scalar) + +ARCHENTRY(strrchr, baseline) + mov %edi, %ecx + and $~0xf, %rdi # align to 16 bytes + movdqa (%rdi), %xmm1 + movd %esi, %xmm0 + and $0xf, %ecx # offset from alignment + pxor %xmm2, %xmm2 + mov $-1, %edx + punpcklbw %xmm0, %xmm0 # c -> cc + shl %cl, %edx # bits corresponding to bytes in the string + punpcklwd %xmm0, %xmm0 # cc -> cccc + xor %r8, %r8 # address of latest match + mov $1, %esi # bit mask of latest match + mov %rdi, %r9 # candidate location for next match + add $16, %rdi # advance to next chunk + + /* check for match in head */ + pcmpeqb %xmm1, %xmm2 # NUL byte present? + pshufd $0, %xmm0, %xmm0 # cccc -> cccccccccccccccc + pcmpeqb %xmm0, %xmm1 # c present? + pmovmskb %xmm2, %eax + pmovmskb %xmm1, %ecx + and %edx, %ecx # c present in the string? + and %edx, %eax # NUL present in the string? + jnz .Lend2 + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa (%rdi), %xmm1 + test %ecx, %ecx # was there a match in the last iter.? + cmovnz %r9, %r8 # remember match if any + cmovnz %ecx, %esi + pxor %xmm2, %xmm2 + pcmpeqb %xmm1, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm1 # c present? + pmovmskb %xmm2, %eax + pmovmskb %xmm1, %ecx + test %eax, %eax # end of string in first half? + jnz .Lend + + movdqa 16(%rdi), %xmm1 + test %ecx, %ecx # was there a match in the last iter.? + cmovnz %rdi, %r8 # remember match if any + cmovnz %ecx, %esi + pxor %xmm2, %xmm2 + pcmpeqb %xmm1, %xmm2 # NUL byte present? + pcmpeqb %xmm0, %xmm1 # c present? + pmovmskb %xmm2, %eax + pmovmskb %xmm1, %ecx + lea 16(%rdi), %r9 + add $32, %rdi + test %eax, %eax # end of string in second half? + jz 0b + + ALIGN_TEXT +.Lend2: sub $16, %rdi +.Lend: lea -1(%rax), %edx + xor %edx, %eax # mask of bytes in the string + and %eax, %ecx # c found in the tail? 
+ cmovnz %rdi, %r8 + cmovnz %ecx, %esi + bsr %esi, %esi # last location of c in (R8) + lea (%r8, %rsi, 1), %rax # pointer to match + ret +ARCHEND(strrchr, baseline) + .section .note.GNU-stack,"",%progbits -- 2.43.0 From e2ceb04a1ab74b0897605c840c508ae5aa679e6a Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Thu, 12 Oct 2023 18:31:55 -0400 Subject: [PATCH 09/30] share/man/man7/simd.7: document strrchr scalar, baseline implementation Also mention missing rindex() entry, which is provided through strrchr(). Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42217 --- share/man/man7/simd.7 | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index 39642429c1c4..4e5c078a6e95 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd October 7, 2023 +.Dd October 12, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -63,7 +63,7 @@ Enhanced functions are present in the following architectures: .It memcpy Ta S Ta S Ta S Ta S Ta SV .It memmove Ta S Ta S Ta S Ta S Ta SV .It memset Ta Ta S Ta S Ta S -.It rindex Ta S +.It rindex Ta S Ta Ta S1 Ta S .It stpcpy Ta Ta Ta S1 .It strcat Ta Ta Ta S Ta S .It strchr Ta S Ta Ta S1 Ta S @@ -75,7 +75,7 @@ Enhanced functions are present in the following architectures: .It strncmp Ta Ta S Ta S1 Ta S .It strncpy Ta Ta Ta Ta Ta S2 .It strnlen Ta Ta Ta S1 -.It strrchr Ta S Ta Ta Ta S +.It strrchr Ta S Ta Ta S1 Ta S .It strpbrk Ta Ta Ta S2 .It strspn Ta Ta Ta S2 .It swab Ta Ta Ta Ta S -- 2.43.0 From 9b0e31a48bc479d92b4f9cf149562fad4035d8a4 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 23 Oct 2023 23:52:01 -0400 Subject: [PATCH 10/30] lib/libc/amd64/string: implement strsep() through strcspn() The strsep() function is basically strcspn() with extra steps. 
On amd64, we now have an optimised implementation of strcspn(), so instead of implementing the inner loop manually, just call into the optimised routine. Sponsored by: The FreeBSD Foundation. Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42346 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/strsep.c | 57 ++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 lib/libc/amd64/string/strsep.c diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index 2baa631fb3fa..ee396f98eccc 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -17,6 +17,7 @@ MDSRCS+= \ strnlen.c \ strpbrk.c \ strrchr.S \ + strsep.c \ strspn.S \ timingsafe_bcmp.S \ timingsafe_memcmp.S diff --git a/lib/libc/amd64/string/strsep.c b/lib/libc/amd64/string/strsep.c new file mode 100644 index 000000000000..9fda56d7e135 --- /dev/null +++ b/lib/libc/amd64/string/strsep.c @@ -0,0 +1,57 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include +#include + +size_t __strcspn(const char *, const char *); + +/* + * We have a fast strcspn() on amd64. Use it over a direct + * implementation of strsep for better performance. + */ +char * +strsep(char **stringp, const char *delim) +{ + size_t n; + char *s; + + s = *stringp; + if (s == NULL) + return (NULL); + + n = __strcspn(s, delim); + if (s[n] == '\0') + *stringp = NULL; + else { + s[n] = '\0'; + *stringp = s + n + 1; + } + + return (s); +} -- 2.43.0 From 878c169933e93ee39b5f0fabd2254d07b400740c Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 23 Oct 2023 23:57:56 -0400 Subject: [PATCH 11/30] share/man/man7/simd.7: document amd64 SIMD use for strsep() Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42346 --- share/man/man7/simd.7 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index 4e5c078a6e95..4437d025394f 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . 
-.Dd October 12, 2023
+.Dd October 23, 2023
 .Dt SIMD 7
 .Os
 .Sh NAME
@@ -77,6 +77,7 @@ Enhanced functions are present in the following architectures:
 .It strnlen Ta Ta Ta S1
 .It strrchr Ta S Ta Ta S1 Ta S
 .It strpbrk Ta Ta Ta S2
+.It strsep Ta Ta Ta S2
 .It strspn Ta Ta Ta S2
 .It swab Ta Ta Ta Ta S
 .It timingsafe_bcmp Ta Ta Ta S1
-- 
2.43.0

From 60a63af2c6cf6c8c9a995cc181fb83196ba360bc Mon Sep 17 00:00:00 2001
From: Robert Clausecker 
Date: Sun, 5 Nov 2023 00:02:00 -0400
Subject: [PATCH 12/30] lib/libc/tests/string/stpncpy_test.c: extend for upcoming SSE implementation

This adds additional unit tests validating the function for
all possible alignment offsets of source and destination.

Also extend the test to allow testing of an external stpncpy
implementation, which greatly simplifies the development of custom
implementations.

Sponsored by: The FreeBSD Foundation
Approved by: ...
MFC after: 1 week
Differential Revision: https://reviews.freebsd.org/D42519
---
 lib/libc/tests/string/stpncpy_test.c | 99 ++++++++++++++++++++++++----
 1 file changed, 85 insertions(+), 14 deletions(-)

diff --git a/lib/libc/tests/string/stpncpy_test.c b/lib/libc/tests/string/stpncpy_test.c
index 8154237eb8c2..8574b2d591be 100644
--- a/lib/libc/tests/string/stpncpy_test.c
+++ b/lib/libc/tests/string/stpncpy_test.c
@@ -1,7 +1,11 @@
 /*-
  * Copyright (c) 2009 David Schultz 
+ * Copyright (c) 2023 The FreeBSD Foundation
  * All rights reserved.
  *
+ * Portions of this software were developed by Robert Clausecker
+ * under sponsorship from the FreeBSD Foundation.
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -27,12 +31,15 @@ #include #include #include +#include #include #include #include #include +static char *(*stpncpy_fn)(char *restrict, const char *restrict, size_t); + static char * makebuf(size_t len, int guard_at_end) { @@ -69,7 +76,7 @@ test_stpncpy(const char *s) dst = makebuf(bufsize, j); memset(dst, 'X', bufsize); len = (bufsize < size) ? bufsize : size - 1; - assert(stpncpy(dst, src, bufsize) == dst+len); + assert(stpncpy_fn(dst, src, bufsize) == dst+len); assert(memcmp(src, dst, len) == 0); for (x = len; x < bufsize; x++) assert(dst[x] == '\0'); @@ -78,33 +85,97 @@ test_stpncpy(const char *s) } } -ATF_TC_WITHOUT_HEAD(nul); -ATF_TC_BODY(nul, tc) +static void +test_sentinel(char *dest, char *src, size_t destlen, size_t srclen) { + size_t i; + const char *res, *wantres; + const char *fail = NULL; + + for (i = 0; i < srclen; i++) + /* src will never include (){} */ + src[i] = '0' + i; + src[srclen] = '\0'; + + /* source sentinels: not to be copied */ + src[-1] = '('; + src[srclen+1] = ')'; + + memset(dest, 0xee, destlen); + + /* destination sentinels: not to be touched */ + dest[-1] = '{'; + dest[destlen] = '}'; + + wantres = dest + (srclen > destlen ? 
destlen : srclen); + res = stpncpy_fn(dest, src, destlen); + + if (dest[-1] != '{') + fail = "start sentinel overwritten"; + else if (dest[destlen] != '}') + fail = "end sentinel overwritten"; + else if (strncmp(src, dest, destlen) != 0) + fail = "string not copied correctly"; + else if (res != wantres) + fail = "incorrect return value"; + else for (i = srclen; i < destlen; i++) + if (dest[i] != '\0') { + fail = "incomplete NUL padding"; + break; + } - test_stpncpy(""); + if (fail) + atf_tc_fail_nonfatal("%s\n" + "stpncpy(%p \"%s\", %p \"%s\", %zu) = %p (want %p)\n", + fail, dest, dest, src, src, destlen, res, wantres); } -ATF_TC_WITHOUT_HEAD(foo); -ATF_TC_BODY(foo, tc) +ATF_TC_WITHOUT_HEAD(null); +ATF_TC_BODY(null, tc) { - - test_stpncpy("foo"); + ATF_CHECK_EQ(stpncpy_fn(NULL, NULL, 0), NULL); } -ATF_TC_WITHOUT_HEAD(glorp); -ATF_TC_BODY(glorp, tc) +ATF_TC_WITHOUT_HEAD(bounds); +ATF_TC_BODY(bounds, tc) { + size_t i; + char buf[64+1]; - test_stpncpy("glorp"); + for (i = 0; i < sizeof(buf) - 1; i++) { + buf[i] = ' ' + i; + buf[i+1] = '\0'; + test_stpncpy(buf); + } +} + +ATF_TC_WITHOUT_HEAD(alignments); +ATF_TC_BODY(alignments, tc) +{ + size_t srcalign, destalign, srclen, destlen; + char src[15+3+64]; /* 15 offsets + 64 max length + NUL + sentinels */ + char dest[15+2+64]; /* 15 offsets + 64 max length + sentinels */ + + for (srcalign = 0; srcalign < 16; srcalign++) + for (destalign = 0; destalign < 16; destalign++) + for (srclen = 0; srclen < 64; srclen++) + for (destlen = 0; destlen < 64; destlen++) + test_sentinel(dest+destalign+1, + src+srcalign+1, destlen, srclen); } ATF_TP_ADD_TCS(tp) { + void *dl_handle; + + dl_handle = dlopen(NULL, RTLD_LAZY); + stpncpy_fn = dlsym(dl_handle, "test_stpncpy"); + if (stpncpy_fn == NULL) + stpncpy_fn = stpncpy; - ATF_TP_ADD_TC(tp, nul); - ATF_TP_ADD_TC(tp, foo); - ATF_TP_ADD_TC(tp, glorp); + ATF_TP_ADD_TC(tp, null); + ATF_TP_ADD_TC(tp, bounds); + ATF_TP_ADD_TC(tp, alignments); return (atf_no_error()); } -- 2.43.0 From 
39c08d47105d2cae1545d237403afcdee40eae5f Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Sun, 29 Oct 2023 23:15:46 -0400 Subject: [PATCH 13/30] lib/libc/amd64/string: add stpncpy scalar, baseline implementation This was surprisingly annoying to get right, despite being such a simple function. A scalar implementation is also provided, it just calls into our optimised memchr(), memcpy(), and memset() routines to carry out its job. I'm quite happy with the performance. glibc only beats us for very long strings, likely due to the use of AVX-512. The scalar implementation just calls into our optimised memchr(), memcpy(), and memset() routines, so it has a high overhead to begin with but then performs ok for the amount of effort that went into it. Still beats the old C code, except for very short strings. Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42519 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/stpncpy.S | 283 +++++++++++++++++++++++++++++ 2 files changed, 284 insertions(+) create mode 100644 lib/libc/amd64/string/stpncpy.S diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index ee396f98eccc..cc8b0e825e3e 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -7,6 +7,7 @@ MDSRCS+= \ memmove.S \ memset.S \ stpcpy.S \ + stpncpy.S \ strcat.S \ strchrnul.S \ strcmp.S \ diff --git a/lib/libc/amd64/string/stpncpy.S b/lib/libc/amd64/string/stpncpy.S new file mode 100644 index 000000000000..5ce0dd093a9e --- /dev/null +++ b/lib/libc/amd64/string/stpncpy.S @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak stpncpy + .set stpncpy, __stpncpy +ARCHFUNCS(__stpncpy) + ARCHFUNC(__stpncpy, scalar) + ARCHFUNC(__stpncpy, baseline) +ENDARCHFUNCS(__stpncpy) + +ARCHENTRY(__stpncpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + + push %rdx + push %rdi + push %rsi + push %rax # dummy push for alignment + + mov %rsi, %rdi + xor %esi, %esi + call CNAME(__memchr) # memchr(src, '\0', len) + pop %rcx # dummy pop + pop %rsi + mov -16(%rbp), %rdi + + test %rax, %rax # NUL found? 
+ jz .Lfullcopy + + mov %rax, %rdx + sub %rsi, %rdx # copy until the NUL byte + add %rdx, -16(%rbp) # advance destination by string length + sub %rdx, -8(%rbp) # and shorten buffer size by string length + call CNAME(memcpy) + + pop %rdi + pop %rdx + xor %esi, %esi + pop %rbp + jmp CNAME(memset) # clear remaining buffer + +.Lfullcopy: + mov -8(%rbp), %rdx + call CNAME(memcpy) # copy whole string + add -8(%rbp), %rax # point to dest[n] + leave + ret +ARCHEND(__stpncpy, scalar) + + /* + * this mask allows us to generate masks of 16-n 0xff bytes + * followed by n 0x00 bytes by loading from .Lmask+n. + */ + .section .rodata +.Lmask: .quad 0xffffffffffffffff + .quad 0xffffffffffffffff + .quad 0x0000000000000000 + .quad 0x0000000000000000 + +/* stpncpy(char *restrict rdi, const char *rsi, size_t rdx) */ +ARCHENTRY(__stpncpy, baseline) +#define bounce (-3*16-8) /* location of on-stack bounce buffer */ + + test %rdx, %rdx # no bytes to copy? + jz .L0 + + mov %esi, %ecx + and $~0xf, %rsi # align source to 16 bytes + movdqa (%rsi), %xmm0 # load head + and $0xf, %ecx # offset from alignment + mov $-1, %r9d + lea -32(%rcx), %rax # set up overflow-proof comparison rdx+rcx<=32 + shl %cl, %r9d # mask of bytes belonging to the string + sub %rcx, %rdi # adjust RDI to correspond to RSI + pxor %xmm1, %xmm1 + movdqa %xmm0, bounce(%rsp) # stash copy of head on the stack + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %r8d + + lea (%rdx, %rcx, 1), %r10 # buffer length from alignment boundary + add %rdx, %rax # less than 2 chunks (32 bytes) to play with? + jnc .Lrunt # if yes, use special runt processing + + movdqu %xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination + and %r9d, %r8d # end of string within head? 
+ jnz .Lheadnul
+
+ movdqu (%rsi, %rcx, 1), %xmm2 # load head from source buffer
+ movdqu %xmm2, (%rdi, %rcx, 1) # and deposit
+
+ add $16, %rsi
+ add $16, %rdi
+ sub $32, %r10
+
+ /* main loop unrolled twice */
+ ALIGN_TEXT
+0: movdqa (%rsi), %xmm0
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
+ pmovmskb %xmm1, %r8d
+ test %r8d, %r8d
+ jnz 3f
+
+ movdqu %xmm0, (%rdi)
+ cmp $16, %r10 # more than a full chunk left?
+ jbe 1f
+
+ movdqa 16(%rsi), %xmm0
+ add $32, %rdi # advance pointers to next chunk
+ add $32, %rsi
+ pxor %xmm1, %xmm1
+ pcmpeqb %xmm0, %xmm1 # NUL byte encountered?
+ pmovmskb %xmm1, %r8d
+ test %r8d, %r8d
+ jnz 2f
+
+ movdqu %xmm0, -16(%rdi)
+ sub $32, %r10 # more than another full chunk left?
+ ja 0b
+
+ sub $16, %rdi # undo second advancement
+ sub $16, %rsi
+ add $16, %r10d # restore number of remaining bytes
+
+ /* 1--16 bytes left but string has not ended yet */
+1: pxor %xmm1, %xmm1
+ pcmpeqb 16(%rsi), %xmm1 # NUL byte in source tail?
+ pmovmskb %xmm1, %r8d
+ bts %r10d, %r8d # treat end of buffer as NUL
+ tzcnt %r8d, %r8d # where is the NUL byte?
+ movdqu (%rsi, %r8, 1), %xmm0 # load source tail before NUL
+ lea 16(%rdi, %r8, 1), %rax # point return value to NUL byte
+ # or end of buffer
+ movdqu %xmm0, (%rdi, %r8, 1) # store tail into the buffer
+ ret
+
+2: sub $16, %rdi # undo second advancement
+ sub $16, %rsi
+ sub $16, %r10
+
+ /* string has ended and buffer has not */
+3: tzcnt %r8d, %r8d # where did the string end?
+ lea .Lmask+16(%rip), %rcx
+ lea (%rdi, %r8, 1), %rax # where the NUL byte will be
+ neg %r8
+ movdqu (%rcx, %r8, 1), %xmm1 # mask with FF where the string is,
+ # 00 where it is not
+ pand %xmm1, %xmm0 # mask out bytes after the string
+ movdqu %xmm0, (%rdi) # store masked current chunk
+ pxor %xmm1, %xmm1
+ sub $16, %r10 # another full chunk left?
+ jbe 1f + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, 16(%rdi) + cmp $16, %r10 + jbe 1f + + movdqu %xmm1, 32(%rdi) + add $32, %rdi + sub $32, %r10 + ja 0b + +1: ret + + /* at least two chunks to play with and NUL while processing head */ +.Lheadnul: + movdqu bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack + tzcnt %r8d, %r8d # find location of NUL byte + movdqu %xmm0, (%rdi, %rcx, 1) # deposit head in the destination + movdqu %xmm1, (%rdi, %r8, 1) # clear out following bytes + movdqu %xmm1, 16(%rdi) # clear out second chunk + lea (%rdi, %r8, 1), %rax # make RAX point to the NUL byte + + add $32, %rdi # advance past first two chunks + sub $32+16, %r10 # advance past first three chunks + jbe 1f # did we pass the end of the buffer? + + /* clear remaining destination buffer (tail has been cleared earlier) */ + ALIGN_TEXT +0: movdqu %xmm1, (%rdi) # clear out buffer chunk + cmp $16, %r10 + jbe 1f + + movdqu %xmm1, 16(%rdi) + add $32, %rdi + sub $32, %r10 + ja 0b + +1: ret + + /* 1--32 bytes to copy, bounce through the stack */ +.Lrunt: movdqa %xmm1, bounce+16(%rsp) # clear out rest of on-stack copy + bts %r10d, %r8d # treat end of buffer as end of string + and %r9w, %r8w # end of string within first buffer? + jnz 0f # if yes, do not inspect second buffer + + movdqa 16(%rsi), %xmm0 # load second chunk of input + movdqa %xmm0, bounce+16(%rsp) # stash copy on stack + pcmpeqb %xmm1, %xmm0 # NUL in second chunk? + pmovmskb %xmm0, %r9d + shl $16, %r9d + or %r9d, %r8d # merge found NUL bytes into NUL mask + + /* end of string after one buffer */ +0: tzcnt %r8d, %r8d # location of last char in string + movdqu %xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string + lea bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack + lea (%rdi, %r8, 1), %rax # return pointer to NUL byte + + cmp $16, %edx # at least 16 bytes to transfer? 
+ jae .L1631 + + mov (%rsi), %r8 # load string head + cmp $8, %edx # at least 8 bytes to transfer? + jae .L0815 + + cmp $4, %edx # at least 4 bytes to transfer? + jae .L0407 + + movzwl -2(%rsi, %rdx, 1), %esi # load last two bytes of string + mov %r8b, (%rdi, %rcx, 1) # store first byte + + cmp $2, %edx # at least 2 bytes to transfer? + jb .L1 + + mov %si, -2(%rdi, %r10, 1) # store last two bytes of string +.L1: ret + +.L1631: movdqu (%rsi), %xmm0 # load first 16 bytes of string + movdqu -16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string + movdqu %xmm0, (%rdi, %rcx, 1) + movdqu %xmm1, -16(%rdi, %r10, 1) + ret + +.L0815: mov -8(%rsi, %rdx, 1), %rdx # load last 8 bytes of string + mov %r8, (%rdi, %rcx, 1) + mov %rdx, -8(%rdi, %r10, 1) + ret + +.L0407: mov -4(%rsi, %rdx, 1), %edx # load last four bytes of string + mov %r8d, (%rdi, %rcx, 1) + mov %edx, -4(%rdi, %r10, 1) + ret + + /* length 0 buffer: just return dest */ +.L0: mov %rdi, %rax + ret +ARCHEND(__stpncpy, baseline) + + .section .note.GNU-stack,"",%progbits -- 2.43.0 From ec3f518dac024f1fd76b09a457f6630798b80107 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Wed, 8 Nov 2023 23:25:55 -0500 Subject: [PATCH 14/30] lib/libc/amd64/string: implement strncpy() by calling stpncpy() Sponsored by: The FreeBSD Foundation Approved by: ... 
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42519 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/strncpy.c | 41 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) create mode 100644 lib/libc/amd64/string/strncpy.c diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index cc8b0e825e3e..d982061e080b 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -15,6 +15,7 @@ MDSRCS+= \ strcspn.S \ strlen.S \ strncmp.S \ + strncpy.c \ strnlen.c \ strpbrk.c \ strrchr.S \ diff --git a/lib/libc/amd64/string/strncpy.c b/lib/libc/amd64/string/strncpy.c new file mode 100644 index 000000000000..b3d868787fbe --- /dev/null +++ b/lib/libc/amd64/string/strncpy.c @@ -0,0 +1,41 @@ +/*- + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include +#include + +char *__stpncpy(char *restrict, const char *restrict, size_t); + +char * +strncpy(char *restrict dst, const char *restrict src, size_t len) +{ + + __stpncpy(dst, src, len); + + return (dst); +} -- 2.43.0 From de2d890440095993e96fab8affbf607f6b14e62d Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Wed, 8 Nov 2023 23:39:11 -0500 Subject: [PATCH 15/30] share/man/man7/simd.7: document simd-enhanced strncpy, stpncpy Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42519 --- share/man/man7/simd.7 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index 4437d025394f..f3818c530af5 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . 
-.Dd October 23, 2023
+.Dd November 8, 2023
 .Dt SIMD 7
 .Os
 .Sh NAME
@@ -65,6 +65,7 @@ Enhanced functions are present in the following architectures:
 .It memset Ta Ta S Ta S Ta S
 .It rindex Ta S Ta Ta S1 Ta S
 .It stpcpy Ta Ta Ta S1
+.It stpncpy Ta Ta Ta S1
 .It strcat Ta Ta Ta S Ta S
 .It strchr Ta S Ta Ta S1 Ta S
 .It strchrnul Ta Ta Ta S1
@@ -73,7 +74,7 @@ Enhanced functions are present in the following architectures:
 .It strcspn Ta Ta Ta S2
 .It strlen Ta Ta S Ta S1
 .It strncmp Ta Ta S Ta S1 Ta S
-.It strncpy Ta Ta Ta Ta Ta S2
+.It strncpy Ta Ta Ta S1 Ta Ta S2
 .It strnlen Ta Ta Ta S1
 .It strrchr Ta S Ta Ta S1 Ta S
 .It strpbrk Ta Ta Ta S2
@@ -209,7 +210,7 @@ SIMD-enhanced functions were first added with
 for
 .Cm powerpc64
 and with
-.Fx 14.0
+.Fx 14.1
 for
 .Cm amd64 .
 .Pp
-- 
2.43.0

From 5710adcb809b0297b60dd86874f377d18301ed05 Mon Sep 17 00:00:00 2001
From: Robert Clausecker 
Date: Tue, 14 Nov 2023 13:09:08 -0500
Subject: [PATCH 16/30] lib/libc/amd64/string/strcat.S: enable use of SIMD

strcat has a bespoke scalar assembly implementation we inherited
from NetBSD. While it performs well, it is better to call into our
SIMD implementations if any SIMD features are available at all. So
do that and implement strcat() by calling into strlen() and
strcpy() if these are available.

Sponsored by:	The FreeBSD Foundation
Approved by:	...
MFC after:	1 week
Differential Revision:	https://reviews.freebsd.org/D42600
---
 lib/libc/amd64/string/strcat.S | 47 ++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/lib/libc/amd64/string/strcat.S b/lib/libc/amd64/string/strcat.S
index 0834408acfb7..081e98840cee 100644
--- a/lib/libc/amd64/string/strcat.S
+++ b/lib/libc/amd64/string/strcat.S
@@ -1,6 +1,14 @@
-/*
- * Written by J.T. Conklin
- * Public domain.
+/*-
+ * Copyright (c) 2023, The FreeBSD Foundation
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Portions of this software were developed by Robert Clausecker
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcat.S
+ * written by J.T. Conklin
+ * that was originally dedicated to the public domain
  */
 
 #include
@@ -8,7 +16,14 @@
 RCSID("$NetBSD: strcat.S,v 1.4 2004/07/26 18:51:21 drochner Exp $")
 #endif
 
-ENTRY(strcat)
+#include "amd64_archlevel.h"
+
+ARCHFUNCS(strcat)
+	ARCHFUNC(strcat, scalar)
+	ARCHFUNC(strcat, baseline)
+ENDARCHFUNCS(strcat)
+
+ARCHENTRY(strcat, scalar)
 	movq	%rdi,%rax
 	movabsq	$0x0101010101010101,%r8
 	movabsq	$0x8080808080808080,%r9
@@ -161,6 +176,28 @@
 
 .Ldone:
 	ret
-END(strcat)
+ARCHEND(strcat, scalar)
+
+/*
+ * Call into strlen + strcpy if we have any SIMD at all.
+ * The scalar implementation above is better for the scalar
+ * case as it avoids the function call overhead, but pessimal
+ * if we could call SIMD routines instead.
+ */
+ARCHENTRY(strcat, baseline)
+	push	%rbp
+	mov	%rsp, %rbp
+	push	%rsi
+	push	%rbx
+	mov	%rdi, %rbx		# remember destination for later
+	call	CNAME(strlen)		# strlen(dest)
+	mov	-8(%rbp), %rsi
+	lea	(%rbx, %rax, 1), %rdi	# dest + strlen(dest)
+	call	CNAME(__stpcpy)		# stpcpy(dest + strlen(dest), src)
+	mov	%rbx, %rax		# return dest
+	pop	%rbx
+	leave
+	ret
+ARCHEND(strcat, baseline)
 
 .section .note.GNU-stack,"",%progbits
-- 
2.43.0

From 72706c35afcee02ae63252a09868f9ad099ac90e Mon Sep 17 00:00:00 2001
From: Robert Clausecker 
Date: Tue, 14 Nov 2023 13:26:21 -0500
Subject: [PATCH 17/30] share/man/man7/simd.7: document SIMD-enhanced strcat

Sponsored by:	The FreeBSD Foundation
Approved by:	...
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42600 --- share/man/man7/simd.7 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index f3818c530af5..6ddf769b6d38 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd November 8, 2023 +.Dd November 14, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -48,7 +48,7 @@ the environment variable .Ev ARCHLEVEL can be used to override this mechanism. .Pp -Enhanced functions are present in the following architectures: +Enhanced functions are present for the following architectures: .Bl -column FUNCTION_________ aarch64_ arm_ amd64_ i386_ ppc64_ -offset indent .It Em FUNCTION Ta Em AARCH64 Ta Em ARM Ta Em AMD64 Ta Em I386 Ta Em PPC64 .It bcmp Ta Ta Ta S1 Ta S @@ -66,7 +66,7 @@ Enhanced functions are present in the following architectures: .It rindex Ta S Ta Ta S1 Ta S .It stpcpy Ta Ta Ta S1 .It stpncpy Ta Ta Ta S1 -.It strcat Ta Ta Ta S Ta S +.It strcat Ta Ta Ta S1 Ta S .It strchr Ta S Ta Ta S1 Ta S .It strchrnul Ta Ta Ta S1 .It strcmp Ta Ta S Ta S1 Ta S @@ -217,7 +217,7 @@ for A .Nm manual page appeared in -.Fx 14.0 . +.Fx 14.1 . . .Sh AUTHOR .An Robert Clausecker Aq Mt fuz@FreeBSD.org -- 2.43.0 From 5fcf1d695c3c4f3487f329fba25714fa2fde01dc Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Thu, 9 Nov 2023 13:08:23 -0500 Subject: [PATCH 18/30] lib/libc/tests/string: add unit test for strlcpy A straightforward derivation from the stpncpy unit test. Sponsored by: The FreeBSD Foundation Approved by: ... 
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42863 --- lib/libc/tests/string/Makefile | 1 + lib/libc/tests/string/strlcpy_test.c | 183 +++++++++++++++++++++++++++ 2 files changed, 184 insertions(+) create mode 100644 lib/libc/tests/string/strlcpy_test.c diff --git a/lib/libc/tests/string/Makefile b/lib/libc/tests/string/Makefile index 5874f7b6b873..81e59ee65155 100644 --- a/lib/libc/tests/string/Makefile +++ b/lib/libc/tests/string/Makefile @@ -16,6 +16,7 @@ ATF_TESTS_C+= stpncpy_test ATF_TESTS_C+= strcmp2_test ATF_TESTS_C+= strcspn_test ATF_TESTS_C+= strerror2_test +ATF_TESTS_C+= strlcpy_test ATF_TESTS_C+= strspn_test ATF_TESTS_C+= strverscmp_test ATF_TESTS_C+= strxfrm_test diff --git a/lib/libc/tests/string/strlcpy_test.c b/lib/libc/tests/string/strlcpy_test.c new file mode 100644 index 000000000000..646bef42683e --- /dev/null +++ b/lib/libc/tests/string/strlcpy_test.c @@ -0,0 +1,183 @@ +/*- + * Copyright (c) 2009 David Schultz + * Copyright (c) 2023 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +size_t (*strlcpy_fn)(char *restrict, const char *restrict, size_t); + +static char * +makebuf(size_t len, int guard_at_end) +{ + char *buf; + size_t alloc_size, page_size; + + page_size = getpagesize(); + alloc_size = roundup2(len, page_size) + page_size; + + buf = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, MAP_ANON, -1, 0); + assert(buf); + if (guard_at_end) { + assert(munmap(buf + alloc_size - page_size, page_size) == 0); + return (buf + alloc_size - page_size - len); + } else { + assert(munmap(buf, page_size) == 0); + return (buf + page_size); + } +} + +static void +test_strlcpy(const char *s) +{ + char *src, *dst; + size_t size, bufsize, x; + int i, j; + + size = strlen(s) + 1; + for (i = 0; i <= 1; i++) { + for (j = 0; j <= 1; j++) { + for (bufsize = 0; bufsize <= size + 10; bufsize++) { + src = makebuf(size, i); + memcpy(src, s, size); + dst = makebuf(bufsize, j); + memset(dst, 'X', bufsize); + assert(strlcpy_fn(dst, src, bufsize) == size-1); + assert(bufsize == 0 || strncmp(src, dst, bufsize - 1) == 0); + for (x = size; x < bufsize; x++) + assert(dst[x] == 'X'); + } + } + } +} + +static void +test_sentinel(char *dest, char *src, size_t destlen, size_t srclen) +{ + size_t i; + size_t res, wantres; + const char *fail = NULL; + + for (i = 0; i < srclen; i++) + /* src will never include (){} */ + src[i] = '0' + i; + src[srclen] = '\0'; 
+ + /* source sentinels: not to be copied */ + src[-1] = '('; + src[srclen+1] = ')'; + + memset(dest, '\xee', destlen); + + /* destination sentinels: not to be touched */ + dest[-1] = '{'; + dest[destlen] = '}'; + + wantres = srclen; + res = strlcpy_fn(dest, src, destlen); + + if (dest[-1] != '{') + fail = "start sentinel overwritten"; + else if (dest[destlen] != '}') + fail = "end sentinel overwritten"; + else if (res != wantres) + fail = "incorrect return value"; + else if (destlen > 0 && strncmp(src, dest, destlen - 1) != 0) + fail = "string not copied correctly"; + else if (destlen > 0 && srclen >= destlen - 1 && dest[destlen-1] != '\0') + fail = "string not NUL terminated"; + else for (i = srclen + 1; i < destlen; i++) + if (dest[i] != '\xee') { + fail = "buffer mutilated behind string"; + break; + } + + if (fail) + atf_tc_fail_nonfatal("%s\n" + "strlcpy(%p \"%s\", %p \"%s\", %zu) = %zu (want %zu)\n", + fail, dest, dest, src, src, destlen, res, wantres); +} + +ATF_TC_WITHOUT_HEAD(null); +ATF_TC_BODY(null, tc) +{ + ATF_CHECK_EQ(strlcpy_fn(NULL, "foo", 0), 3); +} + +ATF_TC_WITHOUT_HEAD(bounds); +ATF_TC_BODY(bounds, tc) +{ + size_t i; + char buf[64+1]; + + for (i = 0; i < sizeof(buf) - 1; i++) { + buf[i] = ' ' + i; + buf[i+1] = '\0'; + test_strlcpy(buf); + } +} + +ATF_TC_WITHOUT_HEAD(alignments); +ATF_TC_BODY(alignments, tc) +{ + size_t srcalign, destalign, srclen, destlen; + char src[15+3+64]; /* 15 offsets + 64 max length + NUL + sentinels */ + char dest[15+2+64]; /* 15 offsets + 64 max length + sentinels */ + + for (srcalign = 0; srcalign < 16; srcalign++) + for (destalign = 0; destalign < 16; destalign++) + for (srclen = 0; srclen < 64; srclen++) + for (destlen = 0; destlen < 64; destlen++) + test_sentinel(dest+destalign+1, + src+srcalign+1, destlen, srclen); +} + +ATF_TP_ADD_TCS(tp) +{ + void *dl_handle; + + dl_handle = dlopen(NULL, RTLD_LAZY); + strlcpy_fn = dlsym(dl_handle, "test_strlcpy"); + if (strlcpy_fn == NULL) + strlcpy_fn = strlcpy; + + 
ATF_TP_ADD_TC(tp, null); + ATF_TP_ADD_TC(tp, bounds); + ATF_TP_ADD_TC(tp, alignments); + + return (atf_no_error()); +} -- 2.43.0 From e3ee2445decb65f7b6c4f214ab965ee92719d898 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Sun, 12 Nov 2023 17:47:06 -0500 Subject: [PATCH 19/30] lib/libc/amd64/string: add strlcpy scalar, baseline implementation Somewhat similar to stpncpy, but different in that we need to compute the full source length even if the buffer is shorter than the source. strlcat is implemented as a simple wrapper around strlcpy. The scalar implementation of strlcpy just calls into strlen() and memcpy() to do the job. Perf-wise we're very close to stpncpy. The code is slightly slower as it needs to carry on with finding the source string length even if the buffer ends before the string. Sponsored by: The FreeBSD Foundation Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42863 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/strlcpy.S | 281 +++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+) create mode 100644 lib/libc/amd64/string/strlcpy.S diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index d982061e080b..03bca498e116 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -13,6 +13,7 @@ MDSRCS+= \ strcmp.S \ strcpy.c \ strcspn.S \ + strlcpy.S \ strlen.S \ strncmp.S \ strncpy.c \ diff --git a/lib/libc/amd64/string/strlcpy.S b/lib/libc/amd64/string/strlcpy.S new file mode 100644 index 000000000000..2b32c6c78047 --- /dev/null +++ b/lib/libc/amd64/string/strlcpy.S @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak strlcpy + .set strlcpy, __strlcpy +ARCHFUNCS(__strlcpy) + ARCHFUNC(__strlcpy, scalar) + ARCHFUNC(__strlcpy, baseline) +ENDARCHFUNCS(__strlcpy) + +ARCHENTRY(__strlcpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + push %rsi + push %rbx + push %rdi + push %rdx + mov %rsi, %rdi + call CNAME(strlen) # strlen(src) + pop %rdx + pop %rdi + mov -8(%rbp), %rsi + mov %rax, %rbx # remember string length for return value + sub $1, %rdx # do not copy into the final byte of the buffer + jc 0f # skip copying altogether if buffer was empty + cmp %rax, %rdx # is the buffer longer than the input? 
+ cmova %rax, %rdx # if yes, only copy the part that fits + movb $0, (%rdi, %rdx, 1) # NUL-terminate output buffer + call CNAME(memcpy) # copy string to output +0: mov %rbx, %rax # restore return value + pop %rbx + leave + ret +ARCHEND(__strlcpy, scalar) + +ARCHENTRY(__strlcpy, baseline) + sub $1, %rdx # do not count NUL byte in buffer length + jb .L0 # go to special code path if len was 0 + + mov %esi, %ecx + pxor %xmm1, %xmm1 + mov %rsi, %r9 # stash a copy of the source pointer for later + and $~0xf, %rsi + pcmpeqb (%rsi), %xmm1 # NUL found in head? + mov $-1, %r8d + and $0xf, %ecx + shl %cl, %r8d # mask of bytes in the string + pmovmskb %xmm1, %eax + and %r8d, %eax + jnz .Lhead_nul + + movdqa 16(%rsi), %xmm3 # load second string chunk + movdqu (%r9), %xmm2 # load unaligned string head + mov $32, %r8d + sub %ecx, %r8d # head length + length of second chunk + pxor %xmm1, %xmm1 + pcmpeqb %xmm3, %xmm1 # NUL found in second chunk? + + sub %r8, %rdx # enough space left for the second chunk? + jbe .Lhead_buf_end + + /* process second chunk */ + pmovmskb %xmm1, %eax + test %eax, %eax + jnz .Lsecond_nul + + /* string didn't end in second chunk and neither did buffer -- not a runt! */ + movdqa 32(%rsi), %xmm0 # load next string chunk + pxor %xmm1, %xmm1 + movdqu %xmm2, (%rdi) # deposit head into buffer + sub %rcx, %rdi # adjust RDI to correspond to RSI + movdqu %xmm3, 16(%rdi) # deposit second chunk + sub %rsi, %rdi # express RDI as distance from RSI + add $32, %rsi # advance RSI past first two chunks + sub $16, %rdx # enough left for another round? + jbe 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 3f + + movdqu %xmm0, (%rsi, %rdi) + movdqa 16(%rsi), %xmm0 # load next string chunk + pxor %xmm1, %xmm1 + cmp $16, %rdx # more than a full chunk left? + jbe 2f + + add $32, %rsi # advance pointers to next chunk + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? 
+ pmovmskb %xmm1, %eax + test %eax, %eax + jnz 4f + + movdqu %xmm0, -16(%rsi, %rdi) + movdqa (%rsi), %xmm0 # load next string chunk + pxor %xmm1, %xmm1 + sub $32, %rdx + ja 0b + +1: sub $16, %rsi # undo second advancement + add $16, %edx + + /* 1--16 bytes left in the buffer but string has not ended yet */ +2: pcmpeqb %xmm1, %xmm0 # NUL byte encountered? + pmovmskb %xmm0, %r8d + mov %r8d, %eax + bts %edx, %r8d # treat end of buffer as end of string + tzcnt %r8d, %r8d # find tail length + add %rsi, %rdi # restore RDI + movdqu (%rsi, %r8, 1), %xmm0 # load string tail + movdqu %xmm0, (%rdi, %r8, 1) # store string tail + movb $0, 16(%rdi, %r8, 1) # NUL terminate + + /* continue to find the end of the string */ + test %eax, %eax # end of string already reached? + jnz 1f + + ALIGN_TEXT +0: pcmpeqb 32(%rsi), %xmm1 + pmovmskb %xmm1, %eax + pxor %xmm1, %xmm1 + test %eax, %eax + jnz 2f + + pcmpeqb 48(%rsi), %xmm1 + pmovmskb %xmm1, %eax + add $32, %rsi + pxor %xmm1, %xmm1 + test %eax, %eax + jz 0b + +1: sub $16, %rsi # undo second advancement +2: tzcnt %eax, %eax # where is the NUL byte? + sub %r9, %rsi + lea 32(%rsi, %rax, 1), %rax # return string length + ret + +4: sub $16, %rsi # undo second advancement + add $16, %rdx # restore number of remaining bytes + + /* string has ended but buffer has not */ +3: tzcnt %eax, %eax # find length of string tail + movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL) + add %rsi, %rdi # restore destination pointer + movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. 
NUL) + sub %r9, %rsi # string length to current chunk + add %rsi, %rax # plus length of current chunk + ret + +.Lhead_buf_end: + pmovmskb %xmm1, %r8d + add $32, %edx # restore edx to (len-1) + ecx + mov %r8d, %eax + shl $16, %r8d # place 2nd chunk NUL mask into bits 16--31 + bts %rdx, %r8 # treat end of buffer as end of string + tzcnt %r8, %rdx # find string/bufer len from alignment boundary + sub %ecx, %edx # find actual string/buffer len + movb $0, (%rdi, %rdx, 1) # write NUL terminator + + /* continue to find the end of the string */ + test %eax, %eax # end of string already reached? + jnz 1f + + ALIGN_TEXT +0: pcmpeqb 32(%rsi), %xmm1 + pmovmskb %xmm1, %eax + pxor %xmm1, %xmm1 + test %eax, %eax + jnz 2f + + pcmpeqb 48(%rsi), %xmm1 + pmovmskb %xmm1, %eax + add $32, %rsi + pxor %xmm1, %xmm1 + test %eax, %eax + jz 0b + +1: sub $16, %rsi +2: tzcnt %eax, %eax + sub %r9, %rsi + lea 32(%rsi, %rax, 1), %rax # return string length + jmp .L0031 + +.Lsecond_nul: + add %r8, %rdx # restore buffer length + tzcnt %eax, %eax # where is the NUL byte? + lea -16(%rcx), %r8d + sub %r8d, %eax # string length + cmp %rax, %rdx # is the string shorter than the buffer? + cmova %rax, %rdx # copy only min(buflen, srclen) bytes + movb $0, (%rdi, %rdx, 1) # write NUL terminator +.L0031: cmp $16, %rdx # at least 16 bytes to copy (not incl NUL)? + jb .L0015 + + /* copy 16--31 bytes */ + movdqu (%r9), %xmm0 # load first 16 bytes + movdqu -16(%r9, %rdx, 1), %xmm1 # load last 16 bytes + movdqu %xmm0, (%rdi) + movdqu %xmm1, -16(%rdi, %rdx, 1) + ret + +.Lhead_nul: + tzcnt %eax, %eax # where is the NUL byte? + sub %ecx, %eax # ... from the beginning of the string? + cmp %rax, %rdx # is the string shorter than the buffer? + cmova %rax, %rdx # copy only min(buflen, srclen) bytes + movb $0, (%rdi, %rdx, 1) # write NUL terminator + + /* process strings of 0--15 bytes (rdx: min(buflen, srclen), rax: srclen) */ +.L0015: cmp $8, %rdx # at least 8 bytes to copy? 
+ jae .L0815 + + cmp $4, %rdx # at least 4 bytes to copy? + jae .L0407 + + cmp $2, %rdx # at least 2 bytes to copy? + jae .L0203 + + movzbl (%r9), %ecx # load first byte from src + mov %cl, (%rdi) # deposit into destination + movb $0, (%rdi, %rdx, 1) # add NUL terminator (again) + ret + +.L0203: movzwl (%r9), %ecx + movzwl -2(%r9, %rdx, 1), %esi + mov %cx, (%rdi) + mov %si, -2(%rdi, %rdx, 1) + ret + +.L0407: mov (%r9), %ecx + mov -4(%r9, %rdx, 1), %esi + mov %ecx, (%rdi) + mov %esi, -4(%rdi, %rdx, 1) + ret + +.L0815: mov (%r9), %rcx + mov -8(%r9, %rdx, 1), %rsi + mov %rcx, (%rdi) + mov %rsi, -8(%rdi, %rdx, 1) + ret + + /* length zero destination: just return the string length */ +.L0: mov %rsi, %rdi + jmp CNAME(strlen) +ARCHEND(__strlcpy, baseline) + + .section .note.GNU-stack,"",%progbits -- 2.43.0 From 54f000b18bec38aa8a2dd40582eb6bb31cae58df Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Tue, 28 Nov 2023 21:32:28 -0500 Subject: [PATCH 20/30] lib/libc/amd64/string: implement strlcat() through strlcpy() This should pick up our optimised memchr(), strlen(), and strlcpy() when strlcat() is called. Approved by: ... 
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42863 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/strlcat.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 lib/libc/amd64/string/strlcat.c diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index 03bca498e116..2b1e276cb3da 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -13,6 +13,7 @@ MDSRCS+= \ strcmp.S \ strcpy.c \ strcspn.S \ + strlcat.c \ strlcpy.S \ strlen.S \ strncmp.S \ diff --git a/lib/libc/amd64/string/strlcat.c b/lib/libc/amd64/string/strlcat.c new file mode 100644 index 000000000000..0c1e1c5d05f7 --- /dev/null +++ b/lib/libc/amd64/string/strlcat.c @@ -0,0 +1,25 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include + +#include + +void *__memchr(const void *, int, size_t); +size_t __strlcpy(char *restrict, const char *restrict, size_t); + +size_t +strlcat(char *restrict dst, const char *restrict src, size_t dstsize) +{ + char *loc = __memchr(dst, '\0', dstsize); + + if (loc != NULL) { + size_t dstlen = (size_t)(loc - dst); + + return (dstlen + __strlcpy(loc, src, dstsize - dstlen)); + } else + return (dstsize + strlen(src)); +} -- 2.43.0 From 8d284959716f6d7972df311310f9a70b24ab35b7 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Tue, 28 Nov 2023 21:35:45 -0500 Subject: [PATCH 21/30] share/man/man7/simd.7: document scalar/baseline strlcpy, strlcat Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42863 --- share/man/man7/simd.7 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index 6ddf769b6d38..e025d3ac9f12 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . 
-.Dd November 14, 2023 +.Dd November 28, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -72,6 +72,8 @@ Enhanced functions are present for the following architectures: .It strcmp Ta Ta S Ta S1 Ta S .It strcpy Ta Ta Ta S1 Ta S Ta S2 .It strcspn Ta Ta Ta S2 +.It strlcat Ta Ta Ta S1 +.It strlcpy Ta Ta Ta S1 .It strlen Ta Ta S Ta S1 .It strncmp Ta Ta S Ta S1 Ta S .It strncpy Ta Ta Ta S1 Ta Ta S2 -- 2.43.0 From 6a070668cd28a0d84b11fdaef64f59c0c7f1f175 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Tue, 28 Nov 2023 22:33:18 -0500 Subject: [PATCH 22/30] share/man/man7/simd.7: add forgotten aarch64 string functions I previously forgot to mention these as they are set up through contrib/arm-optimized/routines/string. Approved by: mjg (blanket, via IRC) MFC-after: 1 week --- share/man/man7/simd.7 | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index e025d3ac9f12..39eafc0e8c5a 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -58,26 +58,26 @@ Enhanced functions are present for the following architectures: .It index Ta S Ta Ta S1 .It ldiv Ta Ta Ta S Ta S .It lldiv Ta Ta Ta S -.It memchr Ta Ta Ta S1 -.It memcmp Ta Ta S Ta S1 Ta S +.It memchr Ta S Ta Ta S1 +.It memcmp Ta S Ta S Ta S1 Ta S .It memcpy Ta S Ta S Ta S Ta S Ta SV .It memmove Ta S Ta S Ta S Ta S Ta SV -.It memset Ta Ta S Ta S Ta S +.It memset Ta S Ta S Ta S Ta S .It rindex Ta S Ta Ta S1 Ta S -.It stpcpy Ta Ta Ta S1 +.It stpcpy Ta S Ta Ta S1 .It stpncpy Ta Ta Ta S1 .It strcat Ta Ta Ta S1 Ta S .It strchr Ta S Ta Ta S1 Ta S -.It strchrnul Ta Ta Ta S1 -.It strcmp Ta Ta S Ta S1 Ta S -.It strcpy Ta Ta Ta S1 Ta S Ta S2 +.It strchrnul Ta S Ta Ta S1 +.It strcmp Ta S Ta S Ta S1 Ta S +.It strcpy Ta S Ta Ta S1 Ta S Ta S2 .It strcspn Ta Ta Ta S2 .It strlcat Ta Ta Ta S1 .It strlcpy Ta Ta Ta S1 -.It strlen Ta Ta S Ta S1 -.It strncmp Ta Ta S Ta S1 Ta S +.It strlen Ta S Ta S Ta S1 +.It strncmp Ta S Ta S Ta S1 Ta S .It strncpy Ta Ta Ta S1 Ta 
Ta S2 -.It strnlen Ta Ta Ta S1 +.It strnlen Ta S Ta Ta S1 .It strrchr Ta S Ta Ta S1 Ta S .It strpbrk Ta Ta Ta S2 .It strsep Ta Ta Ta S2 -- 2.43.0 From 8cd7ee84f5c833001da1936c36c7ca951a45d25c Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Sun, 3 Dec 2023 06:42:17 -0500 Subject: [PATCH 23/30] lib/libc/tests/string: add unit tests for memccpy() Adapted from the strlcpy() unit tests. --- lib/libc/tests/string/Makefile | 1 + lib/libc/tests/string/memccpy_test.c | 205 +++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 lib/libc/tests/string/memccpy_test.c diff --git a/lib/libc/tests/string/Makefile b/lib/libc/tests/string/Makefile index 81e59ee65155..a6e8eb18075a 100644 --- a/lib/libc/tests/string/Makefile +++ b/lib/libc/tests/string/Makefile @@ -9,6 +9,7 @@ ATF_TESTS_C+= ffsll_test ATF_TESTS_C+= fls_test ATF_TESTS_C+= flsl_test ATF_TESTS_C+= flsll_test +ATF_TESTS_C+= memccpy_test ATF_TESTS_C+= memcmp_test ATF_TESTS_C+= memset_s_test ATF_TESTS_C+= strncmp_test diff --git a/lib/libc/tests/string/memccpy_test.c b/lib/libc/tests/string/memccpy_test.c new file mode 100644 index 000000000000..82f4ef34af54 --- /dev/null +++ b/lib/libc/tests/string/memccpy_test.c @@ -0,0 +1,205 @@ +/*- + * Copyright (c) 2009 David Schultz + * Copyright (c) 2023 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +void *(*memccpy_fn)(void *restrict, const void *restrict, int, size_t); + +static char * +makebuf(size_t len, int guard_at_end) +{ + char *buf; + size_t alloc_size, page_size; + + page_size = getpagesize(); + alloc_size = roundup2(len, page_size) + page_size; + + buf = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, MAP_ANON, -1, 0); + assert(buf); + if (guard_at_end) { + assert(munmap(buf + alloc_size - page_size, page_size) == 0); + return (buf + alloc_size - page_size - len); + } else { + assert(munmap(buf, page_size) == 0); + return (buf + page_size); + } +} + +static void +test_memccpy(const char *s) +{ + char *src, *dst, *expected; + size_t size, bufsize, x; + int i, j; + + size = strlen(s) + 1; + for (i = 0; i <= 1; i++) { + for (j = 0; j <= 1; j++) { + for (bufsize = 0; bufsize <= size + 10; bufsize++) { + src = makebuf(size, i); + memcpy(src, s, size); + dst = makebuf(bufsize, j); + memset(dst, 'X', bufsize); + expected = bufsize >= size ? 
dst + size : NULL; + assert(memccpy_fn(dst, src, src[size-1], bufsize) == expected); + assert(bufsize == 0 || strncmp(src, dst, bufsize - 1) == 0); + for (x = size; x < bufsize; x++) + assert(dst[x] == 'X'); + } + } + } +} + +static void +test_sentinel(char *dest, char *src, size_t destlen, size_t srclen) +{ + size_t i, effective_len; + void *res, *wantres; + const char *fail = NULL; + char terminator; + + for (i = 0; i < srclen; i++) + /* src will never include (){} */ + src[i] = '0' + i; + + /* source sentinels: not to be copied */ + src[-1] = '('; + src[srclen] = ')'; + + memset(dest, '\xee', destlen); + + /* destination sentinels: not to be touched */ + dest[-1] = '{'; + dest[destlen] = '}'; + + effective_len = srclen < destlen ? srclen : destlen; + wantres = srclen <= destlen ? dest + srclen : NULL; + terminator = src[srclen-1]; + res = memccpy_fn(dest, src, terminator, destlen); + + if (dest[-1] != '{') + fail = "start sentinel overwritten"; + else if (dest[destlen] != '}') + fail = "end sentinel overwritten"; + else if (res != wantres) + fail = "incorrect return value"; + else if (destlen > 0 && memcmp(src, dest, effective_len) != 0) + fail = "string not copied correctly"; + else for (i = srclen; i < destlen; i++) + if (dest[i] != '\xee') { + fail = "buffer mutilated behind string"; + break; + } + + if (fail) + atf_tc_fail_nonfatal("%s\n" + "memccpy(%p \"%s\", %p \"%s\", %u '%c', %zu) = %p (want %p)\n", + fail, dest, dest, src, src, terminator, terminator, destlen, res, wantres); +} + +ATF_TC_WITHOUT_HEAD(null); +ATF_TC_BODY(null, tc) +{ + ATF_CHECK_EQ(memccpy_fn(NULL, "foo", 42, 0), NULL); +} + +ATF_TC(zero_extension); +ATF_TC_HEAD(zero_extension, tc) +{ + atf_tc_set_md_var(tc, "descr", + "Ensure the upper bits of the terminator are ignored"); +} +ATF_TC_BODY(zero_extension, tc) +{ + int mask = -1 & ~UCHAR_MAX; + char buf[16]; + + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ(memccpy(buf, "foobar", 'r', sizeof(buf)), buf + sizeof("foobar") - 1); + 
ATF_CHECK_EQ(memcmp(buf, "foobar", sizeof("foobar") - 1), 0); + + memset(buf, 0xcc, sizeof(buf)); + ATF_CHECK_EQ(memccpy(buf, "foobar", mask | 'r', sizeof(buf)), buf + sizeof("foobar") - 1); + ATF_CHECK_EQ(memcmp(buf, "foobar", sizeof("foobar") - 1), 0); +} + +ATF_TC_WITHOUT_HEAD(bounds); +ATF_TC_BODY(bounds, tc) +{ + size_t i; + char buf[64]; + + for (i = 0; i < sizeof(buf) - 1; i++) { + buf[i] = ' ' + i; + test_memccpy(buf); + } +} + +ATF_TC_WITHOUT_HEAD(alignments); +ATF_TC_BODY(alignments, tc) +{ + size_t srcalign, destalign, srclen, destlen; + char src[15+2+64]; /* 15 offsets + 64 max length + sentinels */ + char dest[15+2+64]; /* 15 offsets + 64 max length + sentinels */ + + for (srcalign = 0; srcalign < 16; srcalign++) + for (destalign = 0; destalign < 16; destalign++) + for (srclen = 1; srclen < 64; srclen++) + for (destlen = 0; destlen < 64; destlen++) + test_sentinel(dest+destalign+1, + src+srcalign+1, destlen, srclen); +} + +ATF_TP_ADD_TCS(tp) +{ + void *dl_handle; + + dl_handle = dlopen(NULL, RTLD_LAZY); + memccpy_fn = dlsym(dl_handle, "test_memccpy"); + if (memccpy_fn == NULL) + memccpy_fn = memccpy; + + ATF_TP_ADD_TC(tp, null); + ATF_TP_ADD_TC(tp, zero_extension); + ATF_TP_ADD_TC(tp, bounds); + ATF_TP_ADD_TC(tp, alignments); + + return (atf_no_error()); +} -- 2.43.0 From f7dc16b47c797e7cd265b91abfaff6de84126d47 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Sat, 2 Dec 2023 07:28:05 -0500 Subject: [PATCH 24/30] lib/libc/amd64/string: add memccpy scalar, baseline implementation Based on the strlcpy code from D42863, this patch adds a SIMD-enhanced implementation of memccpy for amd64. A scalar implementation calling into memchr and memcpy to do the job is provided, too. Please note that this code does not behave exactly the same as the C implementation of memccpy for overlapping inputs. However, overlapping inputs are not allowed for this function by ISO/IEC 9899:1999 and neither has the C implementation any code to deal with the possibility. 
It just proceeds byte-by-byte, which may or may not do the expected thing for some overlaps. We do not document whether overlapping inputs are supported in memccpy(3). Approved by: ... MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42902 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/memccpy.S | 259 +++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+) create mode 100644 lib/libc/amd64/string/memccpy.S diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index 2b1e276cb3da..b569d2cb8be8 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -3,6 +3,7 @@ MDSRCS+= \ bcmp.S \ memchr.S \ memcmp.S \ + memccpy.S \ memcpy.S \ memmove.S \ memset.S \ diff --git a/lib/libc/amd64/string/memccpy.S b/lib/libc/amd64/string/memccpy.S new file mode 100644 index 000000000000..a2d9e33b3d36 --- /dev/null +++ b/lib/libc/amd64/string/memccpy.S @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2023 The FreeBSD Foundation + * + * This software was developed by Robert Clausecker + * under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE + */ + +#include + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + + .weak memccpy + .set memccpy, __memccpy +ARCHFUNCS(__memccpy) + ARCHFUNC(__memccpy, scalar) + ARCHFUNC(__memccpy, baseline) +ENDARCHFUNCS(__memccpy) + +ARCHENTRY(__memccpy, scalar) + push %rbp # establish stack frame + mov %rsp, %rbp + push %rax # dummy push for alignment + push %rbx + push %rdi + push %rsi + + mov %rsi, %rdi + mov %edx, %esi + mov %rcx, %rdx + mov %rcx, %rbx + call CNAME(__memchr) # ptr = memchr(src, c, len) + + pop %rsi + pop %rdi + lea 1(%rax), %rdx + sub %rsi, %rdx # size = ptr - src + 1 + mov %rbx, %rcx + lea (%rdi, %rdx, 1), %rbx # res = dest + size + test %rax, %rax # if (ptr == NULL) + cmovz %rcx, %rdx # size = len + cmovz %rax, %rbx # res = NULL + call CNAME(memcpy) + + mov %rbx, %rax # return (res) + pop %rbx + leave + ret +ARCHEND(__memccpy, scalar) + +ARCHENTRY(__memccpy, baseline) + sub $1, %rcx # RCX refers to last character in buffer + jb .L0 # go to special code path if len was 0 + + movd %edx, %xmm4 + mov %rcx, %rdx + punpcklbw %xmm4, %xmm4 # c -> cc + mov %esi, %ecx + punpcklwd %xmm4, %xmm4 # cc -> cccc + mov %rsi, %r9 # stash a copy of the source pointer for later + pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc + and $~0xf, %rsi + movdqa %xmm4, %xmm1 + pcmpeqb (%rsi), %xmm1 # NUL found in head? 
+ mov $-1, %r8d + and $0xf, %ecx + shl %cl, %r8d # mask of bytes in the string + pmovmskb %xmm1, %eax + and %r8d, %eax + jnz .Lhead_nul + + movdqa 16(%rsi), %xmm3 # load second string chunk + movdqu (%r9), %xmm2 # load unaligned string head + mov $32, %r8d + sub %ecx, %r8d # head length + length of second chunk + movdqa %xmm4, %xmm1 + pcmpeqb %xmm3, %xmm1 # NUL found in second chunk? + + sub %r8, %rdx # enough space left for the second chunk? + jb .Lhead_buf_end + + /* process second chunk */ + pmovmskb %xmm1, %eax + test %eax, %eax + jnz .Lsecond_nul + + /* string didn't end in second chunk and neither did buffer -- not a runt! */ + movdqa 32(%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + movdqu %xmm2, (%rdi) # deposit head into buffer + sub %rcx, %rdi # adjust RDI to correspond to RSI + movdqu %xmm3, 16(%rdi) # deposit second chunk + sub %rsi, %rdi # express RDI as distance from RSI + add $32, %rsi # advance RSI past first two chunks + sub $16, %rdx # enough left for another round? + jb 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 3f + + movdqu %xmm0, (%rsi, %rdi) + movdqa 16(%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + cmp $16, %rdx # more than a full chunk left? + jb 2f + + add $32, %rsi # advance pointers to next chunk + pcmpeqb %xmm0, %xmm1 # NUL byte encountered? + pmovmskb %xmm1, %eax + test %eax, %eax + jnz 4f + + movdqu %xmm0, -16(%rsi, %rdi) + movdqa (%rsi), %xmm0 # load next string chunk + movdqa %xmm4, %xmm1 + sub $32, %rdx + jae 0b + +1: sub $16, %rsi # undo second advancement + add $16, %edx + + /* 1--16 bytes left in the buffer but string has not ended yet */ +2: pcmpeqb %xmm1, %xmm0 # NUL byte encountered? 
+ pmovmskb %xmm0, %r8d + mov %r8d, %ecx + bts %edx, %r8d # treat end of buffer as end of string + or $0x10000, %eax # ensure TZCNT finds a set bit + tzcnt %r8d, %r8d # find tail length + add %rsi, %rdi # restore RDI + movdqu 1(%rsi, %r8, 1), %xmm0 # load string tail + movdqu %xmm0, 1(%rdi, %r8, 1) # store string tail + lea 17(%rdi, %r8, 1), %rsi # return value if terminator encountered + xor %eax, %eax # return value if no terminator encountered + bt %r8d, %ecx # terminator encountered inside buffer? + cmovc %rsi, %rax # if yes, return pointer, else NULL + ret + +4: sub $16, %rsi # undo second advancement + add $16, %rdx # restore number of remaining bytes + + /* string has ended but buffer has not */ +3: tzcnt %eax, %eax # find length of string tail + movdqu -15(%rsi, %rax, 1), %xmm0 # load string tail (incl. NUL) + add %rsi, %rdi # restore destination pointer + movdqu %xmm0, -15(%rdi, %rax, 1) # store string tail (incl. NUL) + lea 1(%rdi, %rax, 1), %rax # compute return value + ret + +.Lhead_buf_end: + pmovmskb %xmm1, %r8d + add $32, %edx # restore edx to (len-1) + ecx + shl $16, %r8d # place 2nd chunk NUL mask into bits 16--31 + mov %r8d, %r10d + bts %rdx, %r8 # treat end of buffer as if terminator present + xor %eax, %eax # return value if terminator not found + tzcnt %r8, %rdx # find string/buffer len from alignment boundary + lea 1(%rdi, %rdx, 1), %r8 # return value if terminator found + rcx + sub %rcx, %r8 # subtract rcx + bt %rdx, %r10 # was the terminator present? + cmovc %r8, %rax # if yes, return pointer, else NULL + sub %ecx, %edx # find actual string/buffer len + jmp .L0132 + +.Lsecond_nul: + add %r8, %rdx # restore buffer length + tzcnt %eax, %r8d # where is the NUL byte? + lea -16(%rcx), %eax + sub %eax, %r8d # string length + lea 1(%rdi, %r8, 1), %rax # return value if NUL before end of buffer + xor %ecx, %ecx # return value if not + cmp %r8, %rdx # is the string shorter than the buffer? 
+ cmova %r8, %rdx # copy only min(buflen, srclen) bytes + cmovb %rcx, %rax # return NUL if buffer ended before string +.L0132: cmp $16, %rdx # at least 17 bytes to copy (not incl NUL)? + jb .L0116 + + /* copy 17--32 bytes */ + movdqu (%r9), %xmm0 # load first 16 bytes + movdqu -15(%r9, %rdx, 1), %xmm1 # load last 16 bytes + movdqu %xmm0, (%rdi) + movdqu %xmm1, -15(%rdi, %rdx, 1) + ret + +.Lhead_nul: + tzcnt %eax, %r8d # where is the NUL byte? + sub %ecx, %r8d # ... from the beginning of the string? + lea 1(%rdi, %r8, 1), %rax # return value if NUL before end of buffer + xor %ecx, %ecx # return value if not + cmp %r8, %rdx # is the string shorter than the buffer? + cmova %r8, %rdx # copy only min(buflen, srclen) bytes + cmovb %rcx, %rax # return NUL if buffer ended before string + + /* process strings of 1--16 bytes (rdx: min(buflen, srclen), rax: srclen) */ +.L0116: cmp $8, %rdx # at least 9 bytes to copy? + jae .L0916 + + cmp $4, %rdx # at least 5 bytes to copy? + jae .L0508 + + cmp $2, %rdx # at least 3 bytes to copy? + jae .L0304 + + /* copy one or two bytes */ + movzbl (%r9), %ecx # load first byte from src + movzbl (%r9, %rdx, 1), %esi # load last byte from src + mov %cl, (%rdi) # deposit into destination + mov %sil, (%rdi, %rdx, 1) + ret + +.L0304: movzwl (%r9), %ecx + movzwl -1(%r9, %rdx, 1), %esi + mov %cx, (%rdi) + mov %si, -1(%rdi, %rdx, 1) + ret + +.L0508: mov (%r9), %ecx + mov -3(%r9, %rdx, 1), %esi + mov %ecx, (%rdi) + mov %esi, -3(%rdi, %rdx, 1) + ret + +.L0916: mov (%r9), %rcx + mov -7(%r9, %rdx, 1), %rsi + mov %rcx, (%rdi) + mov %rsi, -7(%rdi, %rdx, 1) + ret + + /* length zero destination: return null pointer */ +.L0: xor %eax, %eax + ret +ARCHEND(__memccpy, baseline) + + .section .note.GNU-stack,"",%progbits -- 2.43.0 From 6afe28000e52ec6032e4b7868fd52ed1ff0d6556 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 4 Dec 2023 12:16:50 -0500 Subject: [PATCH 25/30] share/man/man7/simd.7: document simd-enhanced memccpy, strncat Approved by: ... 
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42902 --- share/man/man7/simd.7 | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index 39eafc0e8c5a..281898839c24 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . -.Dd November 28, 2023 +.Dd December 4, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -60,6 +60,7 @@ Enhanced functions are present for the following architectures: .It lldiv Ta Ta Ta S .It memchr Ta S Ta Ta S1 .It memcmp Ta S Ta S Ta S1 Ta S +.It memccpy Ta Ta Ta S1 .It memcpy Ta S Ta S Ta S Ta S Ta SV .It memmove Ta S Ta S Ta S Ta S Ta SV .It memset Ta S Ta S Ta S Ta S @@ -75,6 +76,7 @@ Enhanced functions are present for the following architectures: .It strlcat Ta Ta Ta S1 .It strlcpy Ta Ta Ta S1 .It strlen Ta S Ta S Ta S1 +.It strncat Ta Ta Ta S1 .It strncmp Ta S Ta S Ta S1 Ta S .It strncpy Ta Ta Ta S1 Ta Ta S2 .It strnlen Ta S Ta Ta S1 -- 2.43.0 From 1cb25aa66704177a0207fc2a9f8792d08963d04f Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Mon, 4 Dec 2023 12:32:49 -0500 Subject: [PATCH 26/30] lib/libc/amd64/string: implement strncat() by calling strlen(), memccpy() This picks up the accelerated implementation of memccpy(). Approved by: ...
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42902 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/strncat.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 lib/libc/amd64/string/strncat.c diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index b569d2cb8be8..a14e8a768f01 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -17,6 +17,7 @@ MDSRCS+= \ strlcat.c \ strlcpy.S \ strlen.S \ + strncat.c \ strncmp.S \ strncpy.c \ strnlen.c \ diff --git a/lib/libc/amd64/string/strncat.c b/lib/libc/amd64/string/strncat.c new file mode 100644 index 000000000000..33b278ac5e04 --- /dev/null +++ b/lib/libc/amd64/string/strncat.c @@ -0,0 +1,29 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include + +#include + +void *__memccpy(void *restrict, const void *restrict, int, size_t); + +char * +strncat(char *dest, const char *src, size_t n) +{ + size_t len; + char *endptr; + + len = strlen(dest); + endptr = __memccpy(dest + len, src, '\0', n); + + /* avoid an extra branch */ + if (endptr == NULL) + endptr = dest + len + n + 1; + + endptr[-1] = '\0'; + + return (dest); +} -- 2.43.0 From 725e8ab9bbff073f6b02fb00234a775f70557ba5 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Tue, 5 Dec 2023 09:03:28 -0500 Subject: [PATCH 27/30] lib/libc/string: document restrict qualification of memccpy() arguments POSIX.1-2004 and the upcoming C23 agree that memccpy()'s arguments are restrict qualified and must not overlap. In 2002, restrict qualifiers were added to 's declaration of the function. Make things official and document that the arguments must not overlap. 
See also: 61b60edfd3fff20f884419f8097870c7045315c9 Approved by: kib MFC after: 1 week --- lib/libc/string/bstring.3 | 9 +++++++-- lib/libc/string/memccpy.3 | 33 ++++++++++++++++++++++++++++++--- lib/libc/string/memccpy.c | 2 +- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/lib/libc/string/bstring.3 b/lib/libc/string/bstring.3 index fd976c7676b7..91603fe6dbac 100644 --- a/lib/libc/string/bstring.3 +++ b/lib/libc/string/bstring.3 @@ -27,7 +27,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd June 4, 1993 +.Dd December 5, 2023 .Dt BSTRING 3 .Os .Sh NAME @@ -56,7 +56,12 @@ .Ft int .Fn memcmp "const void *b1" "const void *b2" "size_t len" .Ft void * -.Fn memccpy "void *dst" "const void *src" "int c" "size_t len" +.Fo memccpy +.Fa "void * restrict dst" +.Fa "const void * restrict src" +.Fa "int c" +.Fa "size_t len" +.Fc .Ft void * .Fn memcpy "void *dst" "const void *src" "size_t len" .Ft void * @@ -78,6 +83,6 @@ See the specific manual pages for more information. .Xr memccpy 3 , .Xr memchr 3 , .Xr memcmp 3 , .Xr memcpy 3 , .Xr memmove 3 , .Xr memset 3 diff --git a/lib/libc/string/memccpy.3 b/lib/libc/string/memccpy.3 index ce8d5f65ac93..3bdae24354c1 100644 --- a/lib/libc/string/memccpy.3 +++ b/lib/libc/string/memccpy.3 @@ -25,7 +25,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd June 9, 1993 +.Dd December 5, 2023 .Dt MEMCCPY 3 .Os .Sh NAME @@ -36,7 +36,12 @@ .Sh SYNOPSIS .In string.h .Ft void * -.Fn memccpy "void *dst" "const void *src" "int c" "size_t len" +.Fo memccpy +.Fa "void * restrict dst" +.Fa "const void * restrict src" +.Fa "int c" +.Fa "size_t len" +.Fc .Sh DESCRIPTION The .Fn memccpy @@ -59,13 +64,35 @@ is returned. Otherwise, .Fa len bytes are copied, and a NULL pointer is returned. +If +.Fa src +and +.Fa dst +overlap, behavior is undefined.
.Sh SEE ALSO .Xr bcopy 3 , .Xr memcpy 3 , .Xr memmove 3 , .Xr strcpy 3 +.Sh STANDARDS +The +.Fn memccpy +function conforms to +.St -p1003.1-2004 +and +.\" St -isoC-2024 . +ISO/IEC 9899:2024 (\(lqISO\~C23\(rq). .Sh HISTORY The .Fn memccpy function first appeared in -.Bx 4.4 . +.Bx 4.4 +and was first specified in the +.\" St -svid1 . +System\~V Interface Definition, First Edition (\(lqSVID1\(rq). +The +.Ft restrict +keyword was added to the prototype in +.Fx 5.0.0 +in accordance with the updated specification of +.St -p1003.1-2004 . diff --git a/lib/libc/string/memccpy.c b/lib/libc/string/memccpy.c index 174824ba2393..d6a446503eb6 100644 --- a/lib/libc/string/memccpy.c +++ b/lib/libc/string/memccpy.c @@ -32,7 +32,7 @@ #include void * -memccpy(void *t, const void *f, int c, size_t n) +memccpy(void * restrict t, const void * restrict f, int c, size_t n) { if (n) { -- 2.43.0 From 97b679b9058988ef0b80075c3943e3d5902b59d4 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Wed, 6 Dec 2023 04:11:40 -0500 Subject: [PATCH 28/30] lib/libc/tests/string: add memrchr unit tests The "values" test case is specifically crafted to detect the off-by-one error previously discovered in the scalar strchrnul implementation. Approved by: ...
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42925 --- lib/libc/tests/string/Makefile | 1 + lib/libc/tests/string/memrchr_test.c | 116 +++++++++++++++++++++++++++ 2 files changed, 117 insertions(+) create mode 100644 lib/libc/tests/string/memrchr_test.c diff --git a/lib/libc/tests/string/Makefile b/lib/libc/tests/string/Makefile index a6e8eb18075a..4fce79685c0e 100644 --- a/lib/libc/tests/string/Makefile +++ b/lib/libc/tests/string/Makefile @@ -11,6 +11,7 @@ ATF_TESTS_C+= flsl_test ATF_TESTS_C+= flsll_test ATF_TESTS_C+= memccpy_test ATF_TESTS_C+= memcmp_test +ATF_TESTS_C+= memrchr_test ATF_TESTS_C+= memset_s_test ATF_TESTS_C+= strncmp_test ATF_TESTS_C+= stpncpy_test diff --git a/lib/libc/tests/string/memrchr_test.c b/lib/libc/tests/string/memrchr_test.c new file mode 100644 index 000000000000..12f696c9dc1e --- /dev/null +++ b/lib/libc/tests/string/memrchr_test.c @@ -0,0 +1,116 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include + +#include +#include +#include + +#include + +static void *(*memrchr_fn)(const void *, int, size_t); + +ATF_TC_WITHOUT_HEAD(null); +ATF_TC_BODY(null, tc) +{ + ATF_CHECK_EQ(memrchr_fn(NULL, 42, 0), NULL); +} + +ATF_TC_WITHOUT_HEAD(not_found); +ATF_TC_BODY(not_found, tc) +{ + size_t i, j; + char buf[1+15+64+1]; /* offset [0..15] + 64 buffer bytes + sentinels */ + + buf[0] = 'X'; + memset(buf + 1, '-', sizeof(buf) - 1); + + for (i = 0; i < 16; i++) + for (j = 0; j < 64; j++) { + buf[i + j + 1] = 'X'; + ATF_CHECK_EQ(memrchr_fn(buf + i + 1, 'X', j), NULL); + buf[i + j + 1] = '-'; + } +} + +static void +do_found_test(char buf[], size_t len, size_t first, size_t second) +{ + /* invariant: first <= second */ + + buf[first] = 'X'; + buf[second] = 'X'; + ATF_CHECK_EQ(memrchr_fn(buf, 'X', len), buf + second); + buf[first] = '-'; + buf[second] = '-'; +} + +ATF_TC_WITHOUT_HEAD(found); +ATF_TC_BODY(found, tc) +{ + size_t i, j, k, l; + char buf[1+15+64+1]; + + buf[0] = 
'X'; + memset(buf + 1, '-', sizeof(buf) - 1); + + for (i = 0; i < 16; i++) + for (j = 0; j < 64; j++) + for (k = 0; k < j; k++) + for (l = 0; l <= k; l++) { + buf[i + j + 1] = 'X'; + do_found_test(buf + i + 1, j, l, k); + buf[i + j + 1] = '-'; + } +} + +/* check that the right character is found */ +static void +do_values_test(unsigned char buf[], size_t len, size_t i, int c) +{ + /* sentinels */ + buf[-1] = c; + buf[len] = c; + memset(buf, c + 1, len); + + if (i < len) { + buf[i] = c; + ATF_CHECK_EQ(memrchr_fn(buf, c, len), buf + i); + } else + ATF_CHECK_EQ(memrchr_fn(buf, c, len), NULL); +} + +ATF_TC_WITHOUT_HEAD(values); +ATF_TC_BODY(values, tc) +{ + size_t i, j, k; + int c; + unsigned char buf[1+15+64+1]; + + for (i = 0; i < 16; i++) + for (j = 0; j < 64; j++) + for (k = 0; k <= j; k++) + for (c = 0; c <= UCHAR_MAX; c++) + do_values_test(buf + i + 1, j, k, c); +} + +ATF_TP_ADD_TCS(tp) +{ + void *dl_handle; + + dl_handle = dlopen(NULL, RTLD_LAZY); + memrchr_fn = dlsym(dl_handle, "test_memrchr"); + if (memrchr_fn == NULL) + memrchr_fn = memrchr; + + ATF_TP_ADD_TC(tp, null); + ATF_TP_ADD_TC(tp, not_found); + ATF_TP_ADD_TC(tp, found); + ATF_TP_ADD_TC(tp, values); + + return (atf_no_error()); +} -- 2.43.0 From 6bc3a65c85c7bee4fd188e90c97cb8675cf64285 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Wed, 6 Dec 2023 05:05:47 -0500 Subject: [PATCH 29/30] lib/libc/amd64/string: add memrchr() scalar, baseline implementation The scalar implementation is fairly simplistic and only performs slightly better than the generic C implementation. It could be improved by using the same algorithm as for memchr, but it would have been a lot more complicated. The baseline implementation is similar to timingsafe_memcmp. It's slightly slower than memchr() due to the more complicated main loop, but I don't think that can be significantly improved. Approved by: ... 
MFC after: 1 week Differential Revision: https://reviews.freebsd.org/D42925 --- lib/libc/amd64/string/Makefile.inc | 1 + lib/libc/amd64/string/memrchr.S | 166 +++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+) create mode 100644 lib/libc/amd64/string/memrchr.S diff --git a/lib/libc/amd64/string/Makefile.inc b/lib/libc/amd64/string/Makefile.inc index a14e8a768f01..b1369841bc74 100644 --- a/lib/libc/amd64/string/Makefile.inc +++ b/lib/libc/amd64/string/Makefile.inc @@ -6,6 +6,7 @@ MDSRCS+= \ memccpy.S \ memcpy.S \ memmove.S \ + memrchr.S \ memset.S \ stpcpy.S \ stpncpy.S \ diff --git a/lib/libc/amd64/string/memrchr.S b/lib/libc/amd64/string/memrchr.S new file mode 100644 index 000000000000..4f6c5a238daa --- /dev/null +++ b/lib/libc/amd64/string/memrchr.S @@ -0,0 +1,166 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause + * + * Copyright (c) 2023 Robert Clausecker + */ + +#include + +#include "amd64_archlevel.h" + +#define ALIGN_TEXT .p2align 4, 0x90 + +ARCHFUNCS(memrchr) + ARCHFUNC(memrchr, scalar) + ARCHFUNC(memrchr, baseline) +ENDARCHFUNCS(memrchr) + +ARCHENTRY(memrchr, scalar) + xor %eax, %eax # prospective return value + sub $4, %rdx # 4 bytes left to process? + jb 1f + + ALIGN_TEXT +0: xor %r8, %r8 + lea 2(%rdi), %r10 + cmp %sil, 2(%rdi) + cmovne %r8, %r10 # point to null if no match + + cmp %sil, (%rdi) + cmove %rdi, %r8 # point to first char if match + + lea 1(%rdi), %r9 + cmp %sil, 1(%rdi) + cmovne %r8, %r9 # point to first result if no match in second + + lea 3(%rdi), %r11 + cmp %sil, 3(%rdi) + cmovne %r10, %r11 + + test %r11, %r11 + cmovz %r9, %r11 # take first pair match if none in second + + test %r11, %r11 + cmovnz %r11, %rax # take match in current set if any + + add $4, %rdi + sub $4, %rdx + jae 0b + +1: cmp $-3, %edx # a least one character left to process? + jb 2f + + cmp %sil, (%rdi) + cmove %rdi, %rax + + lea 1(%rdi), %rcx + cmp $-2, %edx # at least two characters left to process? 
+ jb 2f + + cmp %sil, 1(%rdi) + cmove %rcx, %rax + + lea 2(%rdi), %rcx + cmp $-1, %edx # at least three character left to process? + jb 2f + + cmp %sil, 2(%rdi) + cmove %rcx, %rax + +2: ret +ARCHEND(memrchr, scalar) + +ARCHENTRY(memrchr, baseline) + movd %esi, %xmm4 + test %rdx, %rdx # empty buffer? + jz .L0 # if yes, return immediately + + punpcklbw %xmm4, %xmm4 # c -> cc + mov %edi, %ecx + punpcklwd %xmm4, %xmm4 # cc -> cccc + and $~0xf, %rdi # align source pointer + pshufd $0, %xmm4, %xmm4 # cccc -> cccccccccccccccc + and $0xf, %ecx + movdqa %xmm4, %xmm0 + mov $-1, %r8d + pcmpeqb (%rdi), %xmm0 # compare aligned head + shl %cl, %r8d # mask of bytes in the head of the buffer + pmovmskb %xmm0, %eax + + sub $16, %rcx + and %r8d, %eax # match mask + add %rcx, %rdx # advance past head + cmc + jbe .Lrunt # did the string end in the buffer? + + mov %rdi, %rsi # pointer to matching chunk + add $16, %rdi + sub $16, %rdx # enough left for another round? + jbe 1f + + /* main loop unrolled twice */ + ALIGN_TEXT +0: movdqa %xmm4, %xmm0 + pcmpeqb (%rdi), %xmm0 + pmovmskb %xmm0, %r8d + + cmp $16, %rdx # enough left for second chunk? + jbe 2f + + movdqa %xmm4, %xmm0 + pcmpeqb 16(%rdi), %xmm0 + pmovmskb %xmm0, %ecx + + lea 16(%rdi), %r9 + test %ecx, %ecx # match found in second chunk? + cmovz %r8d, %ecx # if not, use match data from first chunk + cmovz %rdi, %r9 + + test %ecx, %ecx # any match found? 
+ cmovnz %ecx, %eax # if yes, overwrite previously found match + cmovnz %r9, %rsi + + add $32, %rdi # advance to next iteration + sub $32, %rdx # advance to next chunks + ja 0b + + /* process remaining 1--16 bytes */ +1: pcmpeqb (%rdi), %xmm4 + mov $0xffff, %r8d + xor %ecx, %ecx + sub %edx, %ecx # number of bytes to be masked out + pmovmskb %xmm4, %r9d + shr %cl, %r8d # mask of bytes to be kept in the buffer + and %r9d, %r8d + cmovnz %r8d, %eax + cmovnz %rdi, %rsi + bsr %eax, %eax + lea (%rsi, %rax, 1), %rsi # pointer to match (or junk) + cmovnz %rsi, %rax # if any match was found, return it + ret + + /* end of chunk reached within first half iteration */ +2: test %r8d, %r8d # match in previous chunk? + cmovnz %r8d, %eax # if yes, overwrite previous chunks + cmovnz %rdi, %rsi + add $16, %rdi # point to tail + sub $16, %edx + jmp 1b # handle tail the same otherwise + + /* runt: string ends within head, edx has negated amount of invalid head bytes */ +.Lrunt: mov $0xffff, %r8d + xor %ecx, %ecx + sub %edx, %ecx + shr %cl, %r8d + and %r8d, %eax + bsr %eax, %eax + lea (%rdi, %rax, 1), %rdi + cmovnz %rdi, %rax + ret + + /* empty buffer: return a null pointer */ +.L0: xor %eax, %eax + ret +ARCHEND(memrchr, baseline) + + .section .note.GNU-stack, "", %progbits -- 2.43.0 From 8d924ab1fed8dc15c41204997cb98889fc31feb8 Mon Sep 17 00:00:00 2001 From: Robert Clausecker Date: Wed, 6 Dec 2023 08:47:03 -0500 Subject: [PATCH 30/30] share/man/man7/simd.7: document SIMD-enhanced memrchr implementation --- share/man/man7/simd.7 | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/share/man/man7/simd.7 b/share/man/man7/simd.7 index 281898839c24..fd9485524aef 100644 --- a/share/man/man7/simd.7 +++ b/share/man/man7/simd.7 @@ -24,7 +24,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE . 
-.Dd December 4, 2023 +.Dd December 6, 2023 .Dt SIMD 7 .Os .Sh NAME @@ -63,6 +63,7 @@ Enhanced functions are present for the following architectures: .It memccpy Ta Ta Ta S1 .It memcpy Ta S Ta S Ta S Ta S Ta SV .It memmove Ta S Ta S Ta S Ta S Ta SV +.It memrchr Ta Ta Ta S1 .It memset Ta S Ta S Ta S Ta S .It rindex Ta S Ta Ta S1 Ta S .It stpcpy Ta S Ta Ta S1 -- 2.43.0