From owner-svn-src-all@freebsd.org  Wed Mar 23 13:29:53 2016
Message-Id: <201603231329.u2NDTq0b062090@repo.freebsd.org>
From: Wojciech Macek <wma@FreeBSD.org>
Date: Wed, 23 Mar 2016 13:29:52 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r297209 - head/sys/arm64/arm64

Author: wma
Date: Wed Mar 23 13:29:52 2016
New Revision: 297209
URL: https://svnweb.freebsd.org/changeset/base/297209

Log:
  ARM64 copyinout improvements

  The first of a set of patches: use wider loads/stores when an aligned
  buffer is being copied.

  In a simple test:
    dd if=/dev/zero of=/dev/null bs=1M count=1024
  throughput jumped from 410 MB/s to 3.6 GB/s.
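
  For illustration only (not part of the committed change): the aligned fast
  path that the new copycommon helper implements in assembly below behaves
  roughly like the following C. The wide_copy() name and signature are
  hypothetical, both pointers are assumed to be 8-byte aligned, and the
  kernel's fault-handler setup is omitted.

      #include <stddef.h>
      #include <stdint.h>

      /* Hypothetical sketch of the aligned path; not the committed code. */
      static void
      wide_copy(uint64_t *dst, const uint64_t *src, size_t len)
      {
              size_t n;

              /* 64-byte blocks, eight 8-byte words per pass (cf. by_blocks). */
              for (n = len >> 6; n != 0; n--)
                      for (int i = 0; i < 8; i++)
                              *dst++ = *src++;
              len &= 0x3f;

              /* Remaining 8-byte words (the assembly moves 16 per ldp/stp). */
              for (n = len >> 3; n != 0; n--)
                      *dst++ = *src++;

              /* A 0-7 byte tail would remain; see the sketch after the diff. */
      }

  The byte-at-a-time loop being removed issued one load and one store per
  byte; moving 64 bytes per iteration of the main loop is the change the
  log above credits for the throughput jump.
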
  TODO:
   - better handling of unaligned buffers (WiP)
   - implement a similar mechanism for bzero

  Submitted by:           Dominik Ermel
  Obtained from:          Semihalf
  Sponsored by:           Cavium
  Reviewed by:            kib, andrew, emaste
  Differential Revision:  https://reviews.freebsd.org/D5664

Modified:
  head/sys/arm64/arm64/copyinout.S

Modified: head/sys/arm64/arm64/copyinout.S
==============================================================================
--- head/sys/arm64/arm64/copyinout.S	Wed Mar 23 13:28:04 2016	(r297208)
+++ head/sys/arm64/arm64/copyinout.S	Wed Mar 23 13:29:52 2016	(r297209)
@@ -51,24 +51,17 @@ END(copyio_fault)
  * int copyout(const void *kaddr, void *udaddr, size_t len)
  */
 ENTRY(copyout)
-	cbz	x2, 2f			/* If len == 0 then skip loop */
+	cbz	x2, 1f
 	add	x3, x1, x2
 	ldr	x4, =VM_MAXUSER_ADDRESS
 	cmp	x3, x4
 	b.hi	copyio_fault_nopcb

-	adr	x6, copyio_fault	/* Get the handler address */
-	SET_FAULT_HANDLER(x6, x7)	/* Set the handler */
-
-1:	ldrb	w4, [x0], #1		/* Load from kaddr */
-	strb	w4, [x1], #1		/* Store in uaddr */
-	sub	x2, x2, #1		/* len-- */
-	cbnz	x2, 1b
-
-	SET_FAULT_HANDLER(xzr, x7)	/* Clear the handler */
+	b	copycommon

-2:	mov	x0, xzr			/* return 0 */
+1:	mov	x0, xzr			/* return 0 */
 	ret
+
 END(copyout)

 /*
@@ -77,24 +70,17 @@ END(copyout)
  * int copyin(const void *uaddr, void *kdaddr, size_t len)
  */
 ENTRY(copyin)
-	cbz	x2, 2f			/* If len == 0 then skip loop */
+	cbz	x2, 1f
 	add	x3, x0, x2
 	ldr	x4, =VM_MAXUSER_ADDRESS
 	cmp	x3, x4
 	b.hi	copyio_fault_nopcb

-	adr	x6, copyio_fault	/* Get the handler address */
-	SET_FAULT_HANDLER(x6, x7)	/* Set the handler */
-
-1:	ldrb	w4, [x0], #1		/* Load from uaddr */
-	strb	w4, [x1], #1		/* Store in kaddr */
-	sub	x2, x2, #1		/* len-- */
-	cbnz	x2, 1b
-
-	SET_FAULT_HANDLER(xzr, x7)	/* Clear the handler */
+	b	copycommon

-2:	mov	x0, xzr			/* return 0 */
+1:	mov	x0, xzr			/* return 0 */
 	ret
+
 END(copyin)

 /*
@@ -130,3 +116,101 @@ ENTRY(copyinstr)
 	csel	w0, wzr, w1, eq		/* If so return success, else failure */
 	ret
 END(copyinstr)
+
+/*
+ * Local helper
+ *
+ * x0 - src pointer
+ * x1 - dst pointer
+ * x2 - size
+ * lr - the return address, so jump here instead of calling
+ *
+ * This function is optimized to minimize concurrent memory accesses.  In
+ * its present form it is suited for cores with a single memory prefetching
+ * unit.
+ *
+ * ARM64TODO:
+ *	Consider using separate functions for each ARM64 core.  Adding memory
+ *	access interleaving might increase total throughput on the A57 or A72.
+ */
+	.text
+	.align	4
+	.local	copycommon
+	.type	copycommon,@function
+
+copycommon:
+	adr	x6, copyio_fault	/* Get the handler address */
+	SET_FAULT_HANDLER(x6, x7)	/* Set the handler */
+
+
+	/* Check alignment */
+	orr	x3, x0, x1
+	ands	x3, x3, 0x07
+	b.eq	aligned
+
+	/* Unaligned is byte by byte copy */
+byte_by_byte:
+	ldrb	w3, [x0], #0x01
+	strb	w3, [x1], #0x01
+	subs	x2, x2, #0x01
+	b.ne	byte_by_byte
+	b	ending
+
+aligned:
+	cmp	x2, #0x10
+	b.lt	lead_out
+	cmp	x2, #0x40
+	b.lt	by_dwords_start
+
+	/* Block copy */
+	lsr	x15, x2, #0x06
+by_blocks:
+	ldp	x3, x4, [x0], #0x10
+	ldp	x5, x6, [x0], #0x10
+	ldp	x7, x8, [x0], #0x10
+	ldp	x9, x10, [x0], #0x10
+	stp	x3, x4, [x1], #0x10
+	stp	x5, x6, [x1], #0x10
+	stp	x7, x8, [x1], #0x10
+	stp	x9, x10, [x1], #0x10
+
+	subs	x15, x15, #0x01
+	b.ne	by_blocks
+
+	and	x2, x2, #0x3f
+
+by_dwords_start:
+	lsr	x15, x2, #0x04
+	cbz	x15, lead_out
+by_dwords:
+	ldp	x3, x4, [x0], #0x10
+	stp	x3, x4, [x1], #0x10
+	subs	x15, x15, #0x01
+	b.ne	by_dwords
+
+	/* Less than 16 bytes to copy */
+lead_out:
+	tbz	x2, #0x03, last_word
+	ldr	x3, [x0], #0x08
+	str	x3, [x1], #0x08
+
+last_word:
+	tbz	x2, #0x02, last_hword
+	ldr	w3, [x0], #0x04
+	str	w3, [x1], #0x04
+
+last_hword:
+	tbz	x2, #0x01, last_byte
+	ldrh	w3, [x0], #0x02
+	strh	w3, [x1], #0x02
+
+last_byte:
+	tbz	x2, #0x00, ending
+	ldrb	w3, [x0]
+	strb	w3, [x1]
+
+ending:
+	SET_FAULT_HANDLER(xzr, x7)	/* Clear the handler */
+
+	mov	x0, xzr			/* return 0 */
+	ret
+	.size	copycommon, . - copycommon
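
Also for illustration, not part of the commit: after the by_dwords loop fewer
than 16 bytes remain, so copycommon finishes the tail without another loop by
testing individual bits of the residual length (the tbz ladder through
lead_out, last_word, last_hword and last_byte). A rough C equivalent, using a
hypothetical copy_tail() helper:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical sketch of the tail handling; len is known to be < 16. */
    static void
    copy_tail(uint8_t *dst, const uint8_t *src, size_t len)
    {
            if (len & 8) { memcpy(dst, src, 8); dst += 8; src += 8; } /* cf. ldr/str  x3 */
            if (len & 4) { memcpy(dst, src, 4); dst += 4; src += 4; } /* cf. ldr/str  w3 */
            if (len & 2) { memcpy(dst, src, 2); dst += 2; src += 2; } /* cf. ldrh/strh w3 */
            if (len & 1) { *dst = *src; }                             /* cf. ldrb/strb w3 */
    }

At most four load/store pairs execute and there is no backward branch, which
keeps the sub-16-byte tail cheap compared with the old one-byte-per-iteration
loop.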