Skip site navigation (1)Skip section navigation (2)
Date:      Tue, 2 Aug 2005 02:25:18 +0800
From:      Xin LI <delphij@frontfree.net>
To:        freebsd-arch@FreeBSD.org, freebsd-amd64@FreeBSD.org
Cc:        obrien@FreeBSD.org
Subject:   [RFC] Port of NetBSD's optimized amd64 string code
Message-ID:  <20050801182518.GA85423@frontfree.net>

next in thread | raw e-mail | index | archive | help

--EuxKj2iCbKjpUGkD
Content-Type: multipart/mixed; boundary="vtzGhvizbBRQ85DL"
Content-Disposition: inline


--vtzGhvizbBRQ85DL
Content-Type: text/plain; charset=us-ascii
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable

Hi, Guys,

Here is a patchset that I have produced to make our libc aware of the
NetBSD assembly implementation of the string related operations.

Cheers,
--=20
Xin LI <delphij frontfree net>	http://www.delphij.net/
See complete headers for GPG key and other information.


--vtzGhvizbBRQ85DL
Content-Type: text/plain; charset=us-ascii
Content-Disposition: attachment; filename="patch-libc::amd64-string"
Content-Transfer-Encoding: quoted-printable

Index: Makefile.inc
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: /home/ncvs/src/lib/libc/amd64/string/Makefile.inc,v
retrieving revision 1.5
diff -u -r1.5 Makefile.inc
--- Makefile.inc	10 Apr 2005 18:58:49 -0000	1.5
+++ Makefile.inc	1 Aug 2005 18:18:29 -0000
@@ -1,4 +1,5 @@
 # $FreeBSD: src/lib/libc/amd64/string/Makefile.inc,v 1.5 2005/04/10 18:58:=
49 alc Exp $
=20
-MDSRCS+=3D bcmp.S bcopy.S bzero.S memcmp.S memcpy.S memmove.S memset.S \
-	strcat.S strcmp.S strcpy.S
+MDSRCS+=3D bcmp.S bcopy.S bzero.S ffs.S index.S memchr.S memcmp.S memcpy.S=
 \
+	memmove.S memset.S rindex.S strcat.S strchr.S strcmp.S strcpy.S  \
+	strlen.S strncmp.S strrchr.S swab.S
Index: ffs.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: ffs.S
diff -N ffs.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ ffs.S	1 Aug 2005 17:54:04 -0000
@@ -0,0 +1,22 @@
+/*
+ * Written by J.T. Conklin <jtc@NetBSD.org>.
+ * Public domain.
+ * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.c=
om>
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+#if 0
+	RCSID("$NetBSD: ffs.S,v 1.2 2003/07/26 19:24:38 salo Exp $")
+#endif
+
+ENTRY(ffs)				/* int ffs(int mask); mask in %edi */
+	bsfl	%edi,%eax		/* eax = index of lowest set bit */
+	jz	.Lzero			/* ZF is set if all bits are 0 */
+	incl	%eax			/* bits numbered from 1, not 0 */
+	ret
+
+	.align 4
+.Lzero:	xorl	%eax,%eax		/* no bit set: return 0 */
+	ret
Index: index.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: index.S
diff -N index.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ index.S	1 Aug 2005 18:08:21 -0000
@@ -0,0 +1,5 @@
+/* $NetBSD: index.S,v 1.3 2004/07/19 20:04:41 drochner Exp $ */
+/* $FreeBSD$ */
+
+#define INDEX
+#include "strchr.S"
Index: memchr.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: memchr.S
diff -N memchr.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ memchr.S	1 Aug 2005 18:09:44 -0000
@@ -0,0 +1,112 @@
+/*
+ * Written by J.T. Conklin <jtc@acorntoolworks.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+#if 0
+	RCSID("$NetBSD: memchr.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
+#endif
+
+ENTRY(memchr)	/* void *memchr(const void *s, int c, size_t n) */
+	movzbq	%sil,%rcx	/* rcx = c as an unsigned char */
+
+	/*
+	 * Align to word boundary
+	 * Consider unrolling loop?
+	 */
+	testq	%rdx,%rdx	/* nbytes =3D=3D 0? */
+	je	.Lzero
+.Lalign:
+	testb	$7,%dil
+	je	.Lword_aligned
+	movq	%rdi,%rax
+	cmpb	(%rdi),%cl
+	je	.Ldone
+	incq	%rdi
+	decq	%rdx
+	jnz	.Lalign
+	jmp	.Lzero
+
+.Lword_aligned:
+	/* copy char to all bytes in word */
+	movb	%cl,%ch
+	movq	%rcx,%rsi
+	salq	$16,%rcx
+	orq	%rsi,%rcx
+	movq	%rcx,%rsi
+	salq	$32,%rcx
+	orq	%rsi,%rcx
+
+	movabsq	$0x0101010101010101,%r8
+	movabsq	$0x8080808080808080,%r9
+
+	.align 4
+.Lloop:
+	cmpq	$7,%rdx		/* fewer than 8 bytes left? */
+	jbe	.Lbyte
+	movq	(%rdi),%rsi
+	addq	$8,%rdi
+	xorq	%rcx,%rsi	/* bytes equal to ch become 0 */
+	subq	$8,%rdx
+	subq	%r8,%rsi
+	testq	%r9,%rsi	/* could any byte have been 0? */
+	je	.Lloop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word are
+	 * equal to ch.
+	 */
+
+	leaq	-8(%rdi),%rax
+	cmpb	-8(%rdi),%cl	/* 1st byte =3D=3D ch? */
+	je	.Ldone
+
+	leaq	-7(%rdi),%rax
+	cmpb	-7(%rdi),%cl	/* 2nd byte =3D=3D ch? */
+	je	.Ldone
+
+	leaq	-6(%rdi),%rax
+	cmpb	-6(%rdi),%cl	/* 3rd byte =3D=3D ch? */
+	je	.Ldone
+
+	leaq	-5(%rdi),%rax
+	cmpb	-5(%rdi),%cl	/* 4th byte =3D=3D ch? */
+	je	.Ldone
+
+	leaq	-4(%rdi),%rax
+	cmpb	-4(%rdi),%cl	/* 5th byte =3D=3D ch? */
+	je	.Ldone
+
+	leaq	-3(%rdi),%rax
+	cmpb	-3(%rdi),%cl	/* 6th byte =3D=3D ch? */
+	je	.Ldone
+
+	leaq	-2(%rdi),%rax
+	cmpb	-2(%rdi),%cl	/* 7th byte =3D=3D ch? */
+	je	.Ldone
+
+	leaq	-1(%rdi),%rax
+	cmpb	-1(%rdi),%cl	/* 8th byte =3D=3D ch? */
+	jne	.Lloop
+	ret
+
+.Lbyte:
+	testq	%rdx,%rdx
+	je	.Lzero
+.Lbyte_loop:
+	movq	%rdi,%rax
+	cmpb	(%rdi),%cl
+	je	.Ldone
+	incq	%rdi
+	decq	%rdx
+	jnz	.Lbyte_loop
+
+.Lzero:
+	xorq	%rax,%rax	/* not found: return NULL */
+
+.Ldone:
+	ret
Index: rindex.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: rindex.S
diff -N rindex.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ rindex.S	1 Aug 2005 18:10:36 -0000
@@ -0,0 +1,5 @@
+/* $NetBSD: rindex.S,v 1.3 2004/07/19 20:04:41 drochner Exp $ */
+/* $FreeBSD$ */
+
+#define RINDEX
+#include "strrchr.S"
Index: strchr.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: strchr.S
diff -N strchr.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strchr.S	1 Aug 2005 18:11:51 -0000
@@ -0,0 +1,137 @@
+/*
+ * Written by J.T. Conklin <jtc@acorntoolworks.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+#if 0
+	RCSID("$NetBSD: strchr.S,v 1.2 2004/07/19 20:04:41 drochner Exp $")
+#endif
+
+#ifdef INDEX
+ENTRY(index)	/* char *index(const char *s, int c) */
+#else
+ENTRY(strchr)	/* char *strchr(const char *s, int c) */
+#endif
+	movzbq	%sil,%rcx	/* rcx = c as an unsigned char */
+
+	/*
+	 * Align to word boundary.
+	 * Consider unrolling loop?
+	 */
+.Lalign:
+	testb	$7,%dil
+	je	.Lword_aligned
+	movb	(%rdi),%dl
+	cmpb	%cl,%dl
+	je	.Ldone
+	incq	%rdi
+	testb	%dl,%dl		/* was that the terminating NUL? */
+	jne	.Lalign
+	jmp	.Lzero
+
+.Lword_aligned:
+	/* copy char to all bytes in word */
+	movb	%cl,%ch
+	movq	%rcx,%rdx
+	salq	$16,%rcx
+	orq	%rdx,%rcx
+	movq	%rcx,%rdx
+	salq	$32,%rcx
+	orq	%rdx,%rcx
+
+	movabsq	$0x0101010101010101,%r8
+	movabsq	$0x8080808080808080,%r9
+
+	/* Check whether any byte in the word is equal to ch or 0. */
+	.align 4
+.Lloop:
+	movq	(%rdi),%rdx
+	addq	$8,%rdi
+	movq	%rdx,%rsi
+	subq	%r8,%rdx	/* rdx: candidate NUL bytes */
+	xorq	%rcx,%rsi	/* bytes equal to ch become 0 */
+	subq	%r8,%rsi	/* rsi: candidate ch bytes */
+	orq	%rsi,%rdx
+	testq	%r9,%rdx
+	je	.Lloop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word match
+	 * ch or are equal to 0.
+	 */
+
+	movb	-8(%rdi),%dl
+	cmpb	%cl,%dl		/* 1st byte =3D=3D ch? */
+	jne	1f
+	subq	$8,%rdi
+	jmp	.Ldone
+1:	testb	%dl,%dl		/* 1st byte =3D=3D 0? */
+	je	.Lzero
+
+	movb	-7(%rdi),%dl
+	cmpb	%cl,%dl		/* 2nd byte =3D=3D ch? */
+	jne	1f
+	subq	$7,%rdi
+	jmp	.Ldone
+1:	testb	%dl,%dl		/* 2nd byte =3D=3D 0? */
+	je	.Lzero
+
+	movb	-6(%rdi),%dl
+	cmpb	%cl,%dl		/* 3rd byte =3D=3D ch? */
+	jne	1f
+	subq	$6,%rdi
+	jmp	.Ldone
+1:	testb	%dl,%dl		/* 3rd byte =3D=3D 0? */
+	je	.Lzero
+
+	movb	-5(%rdi),%dl
+	cmpb	%cl,%dl		/* 4th byte =3D=3D ch? */
+	jne	1f
+	subq	$5,%rdi
+	jmp	.Ldone
+1:	testb	%dl,%dl		/* 4th byte =3D=3D 0? */
+	je	.Lzero
+
+	movb	-4(%rdi),%dl
+	cmpb	%cl,%dl		/* 5th byte =3D=3D ch? */
+	jne	1f
+	subq	$4,%rdi
+	jmp	.Ldone
+1:	testb	%dl,%dl		/* 5th byte =3D=3D 0? */
+	je	.Lzero
+
+	movb	-3(%rdi),%dl
+	cmpb	%cl,%dl		/* 6th byte =3D=3D ch? */
+	jne	1f
+	subq	$3,%rdi
+	jmp	.Ldone
+1:	testb	%dl,%dl		/* 6th byte =3D=3D 0? */
+	je	.Lzero
+
+	movb	-2(%rdi),%dl
+	cmpb	%cl,%dl		/* 7th byte =3D=3D ch? */
+	jne	1f
+	subq	$2,%rdi
+	jmp	.Ldone
+1:	testb	%dl,%dl		/* 7th byte =3D=3D 0? */
+	je	.Lzero
+
+	movb	-1(%rdi),%dl
+	cmpb	%cl,%dl		/* 8th byte =3D=3D ch? */
+	jne	1f
+	subq	$1,%rdi
+	jmp	.Ldone
+1:	testb	%dl,%dl		/* 8th byte =3D=3D 0? */
+	jne	.Lloop
+
+.Lzero:
+	/* If a ch wasn't found, return 0. */
+	xorq	%rdi,%rdi
+
+.Ldone:
+	movq	%rdi,%rax	/* return the match pointer (or 0) */
+	ret
Index: strlen.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: strlen.S
diff -N strlen.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strlen.S	1 Aug 2005 18:12:48 -0000
@@ -0,0 +1,157 @@
+/*
+ * Written by J.T. Conklin <jtc@acorntoolworks.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+#if 0
+	RCSID("$NetBSD: strlen.S,v 1.3 2004/07/19 20:04:41 drochner Exp $")
+#endif
+
+ENTRY(strlen)	/* size_t strlen(const char *s) */
+	movq	%rdi,%rax	/* rax = scan pointer */
+	negq	%rdi		/* rdi = -s, so rdi + rax = bytes scanned */
+
+.Lalign:
+	/* Consider unrolling loop? */
+	testb	$7,%al
+	je	.Lword_aligned
+	cmpb	$0,(%rax)
+	jne	1f
+	leaq	(%rdi,%rax),%rax	/* length = rax - s */
+	ret
+1:	incq	%rax
+	jmp	.Lalign
+
+	/*
+	 * There are many well known branch-free sequences which are used
+	 * for determining whether a zero-byte is contained within a word.
+	 * These sequences are generally much more efficient than loading
+	 * and comparing each byte individually.
+	 *
+	 * The expression [1,2]:
+	 *
+	 * (1)  ~(((x & 0x7f....7f) + 0x7f....7f) | (x | 0x7f....7f))
+	 *
+	 * evaluates to a non-zero value if any of the bytes in the
+	 * original word is zero.
+	 *
+	 * It also has the useful property that bytes in the result word
+	 * that correspond to non-zero bytes in the original word have
+	 * the value 0x00, while bytes corresponding to zero bytes have
+	 * the value 0x80. This allows calculation of the first (and
+	 * last) occurrence of a zero byte within the word (useful for C's
+	 * str* primitives) by counting the number of leading (or
+	 * trailing) zeros and dividing the result by 8.  On machines
+	 * without (or with slow) clz() / ctz() instructions, testing
+	 * each byte in the result word for zero is necessary.
+	 *
+	 * This typically takes 4 instructions (5 on machines without
+	 * "not-or") not including those needed to load the constant.
+	 *
+	 *
+	 * The expression:
+	 *
+	 * (2)  ((x - 0x01....01) & ~x & 0x80....80)
+	 *
+	 * evaluates to a non-zero value if any of the bytes in the
+	 * original word is zero.
+	 *
+	 * On little endian machines, the first byte in the result word
+	 * that corresponds to a zero byte in the original word is 0x80,
+	 * so ctz() can be used as above.  On big endian machines, and
+	 * little endian machines without (or with a slow) ctz() insn,
+	 * testing each byte in the original for zero is necessary
+	 *
+	 * This typically takes 3 instructions (4 on machines without
+	 * "and with complement") not including those needed to load
+	 * constants.
+	 *
+	 *
+	 * The expression:
+	 *
+	 * (3)  ((x - 0x01....01) & 0x80....80)
+	 *
+	 * always evaluates to a non-zero value if any of the bytes in
+	 * the original word is zero.  However, in rare cases, it also
+	 * evaluates to a non-zero value when none of the bytes in the
+	 * original word is zero.
+	 *
+	 * To account for possible false positives, each byte of the
+	 * original word must be checked when the expression evaluates to
+	 * a non-zero value.  However, because it is simpler than those
+	 * presented above, code that uses it will be faster as long as
+	 * the rate of false positives is low.
+	 *
+	 * This is likely, because the false positive can only occur
+	 * if the most significant bit of a byte within the word is set.
+	 * The expression will never fail for typical 7-bit ASCII strings.
+	 *
+	 * This typically takes 2 instructions not including those needed
+	 * to load constants.
+	 *
+	 *
+	 * [1] Henry S. Warren Jr., "Hacker's Delight", Addison-Wesley 2003
+	 *
+	 * [2] International Business Machines, "The PowerPC Compiler Writer's
+	 *     Guide", Warthman Associates, 1996
+	 */
+
+	.align 4
+.Lword_aligned:
+	movabsq	$0x0101010101010101,%r8
+	movabsq	$0x8080808080808080,%r9
+.Lloop:
+	movq	(%rax),%rdx
+	addq	$8,%rax
+	subq	%r8,%rdx	/* expression (3) described above */
+	testq	%r9,%rdx
+	je	.Lloop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word equal 0.
+	 */
+	cmpb	$0,-8(%rax)		/* 1st byte =3D=3D 0? */
+	je	.Lsub8
+	cmpb	$0,-7(%rax)		/* 2nd byte =3D=3D 0? */
+	je	.Lsub7
+	cmpb	$0,-6(%rax)		/* 3rd byte =3D=3D 0? */
+	je	.Lsub6
+	cmpb	$0,-5(%rax)		/* 4th byte =3D=3D 0? */
+	je	.Lsub5
+	cmpb	$0,-4(%rax)		/* 5th byte =3D=3D 0? */
+	je	.Lsub4
+	cmpb	$0,-3(%rax)		/* 6th byte =3D=3D 0? */
+	je	.Lsub3
+	cmpb	$0,-2(%rax)		/* 7th byte =3D=3D 0? */
+	je	.Lsub2
+	cmpb	$0,-1(%rax)		/* 8th byte =3D=3D 0? */
+	jne	.Lloop
+
+.Lsub1:
+	leaq	-1(%rdi,%rax),%rax	/* length = (rax - 1) - s */
+	ret
+.Lsub2:
+	leaq	-2(%rdi,%rax),%rax
+	ret
+.Lsub3:
+	leaq	-3(%rdi,%rax),%rax
+	ret
+.Lsub4:
+	leaq	-4(%rdi,%rax),%rax
+	ret
+.Lsub5:
+	leaq	-5(%rdi,%rax),%rax
+	ret
+.Lsub6:
+	leaq	-6(%rdi,%rax),%rax
+	ret
+.Lsub7:
+	leaq	-7(%rdi,%rax),%rax
+	ret
+.Lsub8:
+	leaq	-8(%rdi,%rax),%rax
+	ret
Index: strncmp.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: strncmp.S
diff -N strncmp.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strncmp.S	1 Aug 2005 18:13:51 -0000
@@ -0,0 +1,108 @@
+/*
+ * Written by J.T. Conklin <jtc@NetBSD.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+#if 0
+        RCSID("$NetBSD: strncmp.S,v 1.2 2003/07/26 19:24:40 salo Exp $")
+#endif
+
+/*
+ * NOTE: I've unrolled the loop eight times: large enough to make a
+ * significant difference, and small enough not to totally trash the
+ * cache.
+ */
+
+ENTRY(strncmp)	/* strncmp(s1 in %rdi, s2 in %rsi, n in %rdx) */
+	testq	%rdx,%rdx		/* sets ZF when n is 0 */
+	jmp	.Lchk			/* Jump into the loop! */
+
+.Linc:	incq	%rdi
+	incq	%rsi
+	decq	%rdx
+.Lchk:	jz	.Leq			/* strings are equal */
+	movb	(%rdi),%al
+	testb	%al,%al
+	jz	.Lsub
+	cmpb	%al,(%rsi)
+	jne	.Lsub
+
+	incq	%rdi
+	incq	%rsi
+	decq	%rdx
+	jz	.Leq
+	movb	(%rdi),%al
+	testb	%al,%al
+	jz	.Lsub
+	cmpb	%al,(%rsi)
+	jne	.Lsub
+
+	incq	%rdi
+	incq	%rsi
+	decq	%rdx
+	jz	.Leq
+	movb	(%rdi),%al
+	testb	%al,%al
+	jz	.Lsub
+	cmpb	%al,(%rsi)
+	jne	.Lsub
+
+	incq	%rdi
+	incq	%rsi
+	decq	%rdx
+	jz	.Leq
+	movb	(%rdi),%al
+	testb	%al,%al
+	jz	.Lsub
+	cmpb	%al,(%rsi)
+	jne	.Lsub
+
+	incq	%rdi
+	incq	%rsi
+	decq	%rdx
+	jz	.Leq
+	movb	(%rdi),%al
+	testb	%al,%al
+	jz	.Lsub
+	cmpb	%al,(%rsi)
+	jne	.Lsub
+
+	incq	%rdi
+	incq	%rsi
+	decq	%rdx
+	jz	.Leq
+	movb	(%rdi),%al
+	testb	%al,%al
+	jz	.Lsub
+	cmpb	%al,(%rsi)
+	jne	.Lsub
+
+	incq	%rdi
+	incq	%rsi
+	decq	%rdx
+	jz	.Leq
+	movb	(%rdi),%al
+	testb	%al,%al
+	jz	.Lsub
+	cmpb	%al,(%rsi)
+	jne	.Lsub
+
+	incq	%rdi
+	incq	%rsi
+	decq	%rdx
+	jz	.Leq
+	movb	(%rdi),%al
+	testb	%al,%al
+	jz	.Lsub
+	cmpb	%al,(%rsi)
+	je	.Linc
+
+.Lsub:	movzbl	(%rdi),%eax		/* unsigned comparison */
+	movzbl	(%rsi),%ecx
+	subl	%ecx,%eax		/* 0 if both hit NUL together */
+	ret
+.Leq:	xorl	%eax,%eax		/* n bytes matched: return 0 */
+	ret
Index: strrchr.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: strrchr.S
diff -N strrchr.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ strrchr.S	1 Aug 2005 18:15:07 -0000
@@ -0,0 +1,127 @@
+/*
+ * Written by J.T. Conklin <jtc@acorntoolworks.com>
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+#if 0
+	RCSID("$NetBSD: strrchr.S,v 1.2 2004/07/19 20:04:41 drochner Exp $")
+#endif
+
+#ifdef RINDEX
+ENTRY(rindex)	/* char *rindex(const char *s, int c) */
+#else
+ENTRY(strrchr)	/* char *strrchr(const char *s, int c) */
+#endif
+	movzbq	%sil,%rcx	/* rcx = c as an unsigned char */
+
+	/* zero return value */
+	xorq	%rax,%rax
+
+	/*
+	 * Align to word boundary.
+	 * Consider unrolling loop?
+	 */
+.Lalign:
+	testb	$7,%dil
+	je	.Lword_aligned
+	movb	(%rdi),%dl
+	cmpb	%cl,%dl
+	cmoveq	%rdi,%rax	/* remember the latest match */
+	incq	%rdi
+	testb	%dl,%dl		/* was that the terminating NUL? */
+	jne	.Lalign
+	jmp	.Ldone
+
+.Lword_aligned:
+	/* copy char to all bytes in word */
+	movb	%cl,%ch
+	movq	%rcx,%rdx
+	salq	$16,%rcx
+	orq	%rdx,%rcx
+	movq	%rcx,%rdx
+	salq	$32,%rcx
+	orq	%rdx,%rcx
+
+	movabsq	$0x0101010101010101,%r8
+	movabsq	$0x8080808080808080,%r9
+
+	/* Check whether any byte in the word is equal to ch or 0. */
+	.align 4
+.Lloop:
+	movq	(%rdi),%rdx
+	addq	$8,%rdi
+	movq	%rdx,%rsi
+	subq	%r8,%rdx	/* rdx: candidate NUL bytes */
+	xorq	%rcx,%rsi	/* bytes equal to ch become 0 */
+	subq	%r8,%rsi	/* rsi: candidate ch bytes */
+	orq	%rsi,%rdx
+	testq	%r9,%rdx
+	je	.Lloop
+
+	/*
+	 * In rare cases, the above loop may exit prematurely. We must
+	 * return to the loop if none of the bytes in the word match
+	 * ch or are equal to 0.
+	 */
+
+	movb	-8(%rdi),%dl
+	cmpb	%cl,%dl		/* 1st byte =3D=3D ch? */
+	jne	1f
+	leaq	-8(%rdi),%rax
+1:	testb	%dl,%dl		/* 1st byte =3D=3D 0? */
+	je	.Ldone
+
+	movb	-7(%rdi),%dl
+	cmpb	%cl,%dl		/* 2nd byte =3D=3D ch? */
+	jne	1f
+	leaq	-7(%rdi),%rax
+1:	testb	%dl,%dl		/* 2nd byte =3D=3D 0? */
+	je	.Ldone
+
+	movb	-6(%rdi),%dl
+	cmpb	%cl,%dl		/* 3rd byte =3D=3D ch? */
+	jne	1f
+	leaq	-6(%rdi),%rax
+1:	testb	%dl,%dl		/* 3rd byte =3D=3D 0? */
+	je	.Ldone
+
+	movb	-5(%rdi),%dl
+	cmpb	%cl,%dl		/* 4th byte =3D=3D ch? */
+	jne	1f
+	leaq	-5(%rdi),%rax
+1:	testb	%dl,%dl		/* 4th byte =3D=3D 0? */
+	je	.Ldone
+
+	movb	-4(%rdi),%dl
+	cmpb	%cl,%dl		/* 5th byte =3D=3D ch? */
+	jne	1f
+	leaq	-4(%rdi),%rax
+1:	testb	%dl,%dl		/* 5th byte =3D=3D 0? */
+	je	.Ldone
+
+	movb	-3(%rdi),%dl
+	cmpb	%cl,%dl		/* 6th byte =3D=3D ch? */
+	jne	1f
+	leaq	-3(%rdi),%rax
+1:	testb	%dl,%dl		/* 6th byte =3D=3D 0? */
+	je	.Ldone
+
+	movb	-2(%rdi),%dl
+	cmpb	%cl,%dl		/* 7th byte =3D=3D ch? */
+	jne	1f
+	leaq	-2(%rdi),%rax
+1:	testb	%dl,%dl		/* 7th byte =3D=3D 0? */
+	je	.Ldone
+
+	movb	-1(%rdi),%dl
+	cmpb	%cl,%dl		/* 8th byte =3D=3D ch? */
+	jne	1f
+	leaq	-1(%rdi),%rax
+1:	testb	%dl,%dl		/* 8th byte =3D=3D 0? */
+	jne	.Lloop
+
+.Ldone:
+	ret			/* rax = last match, or 0 if none */
Index: swab.S
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=
=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D
RCS file: swab.S
diff -N swab.S
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ swab.S	1 Aug 2005 18:18:17 -0000
@@ -0,0 +1,47 @@
+/*
+ * Written by J.T. Conklin <jtc@NetBSD.org>.
+ * Public domain.
+ */
+
+#include <machine/asm.h>
+__FBSDID("$FreeBSD$");
+
+#if 0
+	RCSID("$NetBSD: swab.S,v 1.2 2003/07/26 19:24:40 salo Exp $")
+#endif
+
+#define LOAD_SWAP_STORE_WORD \
+	lodsw			/* ax = next source word */ ; \
+	xchgb	%al,%ah		/* swap its two bytes */ ; \
+	stosw			/* store to destination */
+
+ENTRY(swab)	/* void swab(const void *from, void *to, ssize_t len) */
+	xchgq	%rdi,%rsi		# lodsw reads (%rsi), stosw writes (%rdi)
+	cld				# set direction forward
+	sarq	$1,%rdx			# rdx = len / 2 words, keeping the sign
+	jle	.Ldone			# len is negative, 0 or 1: do nothing
+	testq	$7,%rdx			# copy first group of 1 to 7 words
+	jz	.Lblk			# while swapping alternate bytes.
+.Lone:	lodsw
+	rorw	$8,%ax
+	stosw
+	decq	%rdx
+	testq	$7,%rdx
+	jnz	.Lone
+
+.Lblk:	shrq	$3,%rdx			# copy remainder 8 words at a time
+	jz	.Ldone			# while swapping alternate bytes.
+.Lloop:
+	LOAD_SWAP_STORE_WORD
+	LOAD_SWAP_STORE_WORD
+	LOAD_SWAP_STORE_WORD
+	LOAD_SWAP_STORE_WORD
+	LOAD_SWAP_STORE_WORD
+	LOAD_SWAP_STORE_WORD
+	LOAD_SWAP_STORE_WORD
+	LOAD_SWAP_STORE_WORD
+
+	decq	%rdx
+	jnz	.Lloop
+.Ldone:
+	ret

--vtzGhvizbBRQ85DL--

--EuxKj2iCbKjpUGkD
Content-Type: application/pgp-signature
Content-Disposition: inline

-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.4.1 (FreeBSD)

iD8DBQFC7mkO/cVsHxFZiIoRArcuAJ9AF9F0+YFYsQpLVPvnd3hGmKNXBgCdFAIS
mrMJ3TeaXKrzkBqS3vxeQGI=
=TJe9
-----END PGP SIGNATURE-----

--EuxKj2iCbKjpUGkD--



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20050801182518.GA85423>