From owner-svn-src-all@freebsd.org  Wed Mar 23 13:29:53 2016
Message-Id: <201603231329.u2NDTq0b062090@repo.freebsd.org>
From: Wojciech Macek <wma@FreeBSD.org>
Date: Wed, 23 Mar 2016 13:29:52 +0000 (UTC)
To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject: svn commit: r297209 - head/sys/arm64/arm64

Author: wma
Date: Wed Mar 23 13:29:52 2016
New Revision: 297209
URL: https://svnweb.freebsd.org/changeset/base/297209

Log:
  ARM64 copyinout improvements

  The first of a set of patches: use wider loads/stores when an aligned
  buffer is being copied.

  In a simple test:
    dd if=/dev/zero of=/dev/null bs=1M count=1024
  throughput jumped from 410 MB/s to 3.6 GB/s.
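
  For illustration only (not part of the committed change): the aligned fast
  path that the new copycommon helper implements in assembly below behaves
  roughly like the following C. The wide_copy() name and signature are
  hypothetical, both pointers are assumed to be 8-byte aligned, and the
  kernel's fault-handler setup is omitted.

      #include <stddef.h>
      #include <stdint.h>

      /* Hypothetical sketch of the aligned path; not the committed code. */
      static void
      wide_copy(uint64_t *dst, const uint64_t *src, size_t len)
      {
              size_t n;

              /* 64-byte blocks, eight 8-byte words per pass (cf. by_blocks). */
              for (n = len >> 6; n != 0; n--)
                      for (int i = 0; i < 8; i++)
                              *dst++ = *src++;
              len &= 0x3f;

              /* Remaining 8-byte words (the assembly moves 16 per ldp/stp). */
              for (n = len >> 3; n != 0; n--)
                      *dst++ = *src++;

              /* A 0-7 byte tail would remain; see the sketch after the diff. */
      }

  The byte-at-a-time loop being removed issued one load and one store per
  byte; moving 64 bytes per iteration of the main loop is the change the
  log above credits for the throughput jump.
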
  TODO:
   - better handling of unaligned buffers (WiP)
   - implement a similar mechanism for bzero

  Submitted by:           Dominik Ermel
  Obtained from:          Semihalf
  Sponsored by:           Cavium
  Reviewed by:            kib, andrew, emaste
  Differential Revision:  https://reviews.freebsd.org/D5664

Modified:
  head/sys/arm64/arm64/copyinout.S

Modified: head/sys/arm64/arm64/copyinout.S
==============================================================================
--- head/sys/arm64/arm64/copyinout.S	Wed Mar 23 13:28:04 2016	(r297208)
+++ head/sys/arm64/arm64/copyinout.S	Wed Mar 23 13:29:52 2016	(r297209)
@@ -51,24 +51,17 @@ END(copyio_fault)
  * int copyout(const void *kaddr, void *udaddr, size_t len)
  */
 ENTRY(copyout)
-	cbz	x2, 2f			/* If len == 0 then skip loop */
+	cbz	x2, 1f
 	add	x3, x1, x2
 	ldr	x4, =VM_MAXUSER_ADDRESS
 	cmp	x3, x4
 	b.hi	copyio_fault_nopcb

-	adr	x6, copyio_fault	/* Get the handler address */
-	SET_FAULT_HANDLER(x6, x7)	/* Set the handler */
-
-1:	ldrb	w4, [x0], #1		/* Load from kaddr */
-	strb	w4, [x1], #1		/* Store in uaddr */
-	sub	x2, x2, #1		/* len-- */
-	cbnz	x2, 1b
-
-	SET_FAULT_HANDLER(xzr, x7)	/* Clear the handler */
+	b	copycommon

-2:	mov	x0, xzr			/* return 0 */
+1:	mov	x0, xzr			/* return 0 */
 	ret
+
 END(copyout)

 /*
@@ -77,24 +70,17 @@ END(copyout)
  * int copyin(const void *uaddr, void *kdaddr, size_t len)
  */
 ENTRY(copyin)
-	cbz	x2, 2f			/* If len == 0 then skip loop */
+	cbz	x2, 1f
 	add	x3, x0, x2
 	ldr	x4, =VM_MAXUSER_ADDRESS
 	cmp	x3, x4
 	b.hi	copyio_fault_nopcb

-	adr	x6, copyio_fault	/* Get the handler address */
-	SET_FAULT_HANDLER(x6, x7)	/* Set the handler */
-
-1:	ldrb	w4, [x0], #1		/* Load from uaddr */
-	strb	w4, [x1], #1		/* Store in kaddr */
-	sub	x2, x2, #1		/* len-- */
-	cbnz	x2, 1b
-
-	SET_FAULT_HANDLER(xzr, x7)	/* Clear the handler */
+	b	copycommon

-2:	mov	x0, xzr			/* return 0 */
+1:	mov	x0, xzr			/* return 0 */
 	ret
+
 END(copyin)

 /*
@@ -130,3 +116,101 @@ ENTRY(copyinstr)
 	csel	w0, wzr, w1, eq		/* If so return success, else failure */
 	ret
 END(copyinstr)
+
+/*
+ * Local helper
+ *
+ * x0 - src pointer
+ * x1 - dst pointer
+ * x2 - size
+ * lr - the return address, so jump here instead of calling
+ *
+ * This function is optimized to minimize concurrent memory accesses.  In
+ * its present form it is suited for cores with a single memory prefetching
+ * unit.
+ *
+ * ARM64TODO:
+ *	Consider using separate functions for each ARM64 core.  Adding memory
+ *	access interleaving might increase total throughput on the A57 or A72.
+ */
+	.text
+	.align	4
+	.local	copycommon
+	.type	copycommon,@function
+
+copycommon:
+	adr	x6, copyio_fault	/* Get the handler address */
+	SET_FAULT_HANDLER(x6, x7)	/* Set the handler */
+
+
+	/* Check alignment */
+	orr	x3, x0, x1
+	ands	x3, x3, 0x07
+	b.eq	aligned
+
+	/* Unaligned is byte by byte copy */
+byte_by_byte:
+	ldrb	w3, [x0], #0x01
+	strb	w3, [x1], #0x01
+	subs	x2, x2, #0x01
+	b.ne	byte_by_byte
+	b	ending
+
+aligned:
+	cmp	x2, #0x10
+	b.lt	lead_out
+	cmp	x2, #0x40
+	b.lt	by_dwords_start
+
+	/* Block copy */
+	lsr	x15, x2, #0x06
+by_blocks:
+	ldp	x3, x4, [x0], #0x10
+	ldp	x5, x6, [x0], #0x10
+	ldp	x7, x8, [x0], #0x10
+	ldp	x9, x10, [x0], #0x10
+	stp	x3, x4, [x1], #0x10
+	stp	x5, x6, [x1], #0x10
+	stp	x7, x8, [x1], #0x10
+	stp	x9, x10, [x1], #0x10
+
+	subs	x15, x15, #0x01
+	b.ne	by_blocks
+
+	and	x2, x2, #0x3f
+
+by_dwords_start:
+	lsr	x15, x2, #0x04
+	cbz	x15, lead_out
+by_dwords:
+	ldp	x3, x4, [x0], #0x10
+	stp	x3, x4, [x1], #0x10
+	subs	x15, x15, #0x01
+	b.ne	by_dwords
+
+	/* Less than 16 bytes to copy */
+lead_out:
+	tbz	x2, #0x03, last_word
+	ldr	x3, [x0], #0x08
+	str	x3, [x1], #0x08
+
+last_word:
+	tbz	x2, #0x02, last_hword
+	ldr	w3, [x0], #0x04
+	str	w3, [x1], #0x04
+
+last_hword:
+	tbz	x2, #0x01, last_byte
+	ldrh	w3, [x0], #0x02
+	strh	w3, [x1], #0x02
+
+last_byte:
+	tbz	x2, #0x00, ending
+	ldrb	w3, [x0]
+	strb	w3, [x1]
+
+ending:
+	SET_FAULT_HANDLER(xzr, x7)	/* Clear the handler */
+
+	mov	x0, xzr			/* return 0 */
+	ret
+	.size	copycommon, . - copycommon
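
Also for illustration, not part of the commit: after the by_dwords loop fewer
than 16 bytes remain, so copycommon finishes the tail without another loop by
testing individual bits of the residual length (the tbz ladder through
lead_out, last_word, last_hword and last_byte). A rough C equivalent, using a
hypothetical copy_tail() helper:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical sketch of the tail handling; len is known to be < 16. */
    static void
    copy_tail(uint8_t *dst, const uint8_t *src, size_t len)
    {
            if (len & 8) { memcpy(dst, src, 8); dst += 8; src += 8; } /* cf. ldr/str  x3 */
            if (len & 4) { memcpy(dst, src, 4); dst += 4; src += 4; } /* cf. ldr/str  w3 */
            if (len & 2) { memcpy(dst, src, 2); dst += 2; src += 2; } /* cf. ldrh/strh w3 */
            if (len & 1) { *dst = *src; }                             /* cf. ldrb/strb w3 */
    }

At most four load/store pairs execute and there is no backward branch, which
keeps the sub-16-byte tail cheap compared with the old one-byte-per-iteration
loop.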