Skip site navigation (1)Skip section navigation (2)
Date:      Thu, 26 Jul 2012 18:13:50 GMT
From:      Shane Nievera <nievera@mm.st>
To:        freebsd-gnats-submit@FreeBSD.org
Subject:   kern/170200: [patch] AES-NI XTS mode performance lower than CBC
Message-ID:  <201207261813.q6QIDo7R040201@red.freebsd.org>
Resent-Message-ID: <201207261820.q6QIK88I081289@freefall.freebsd.org>

next in thread | raw e-mail | index | archive | help

>Number:         170200
>Category:       kern
>Synopsis:       [patch] AES-NI XTS mode performance lower than CBC
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Thu Jul 26 18:20:07 UTC 2012
>Closed-Date:
>Last-Modified:
>Originator:     Shane Nievera
>Release:        10.0-CURRENT
>Organization:
None
>Environment:
FreeBSD taxeater.zerocs.ca 10.0-CURRENT FreeBSD 10.0-CURRENT #0 r238772M: Wed Jul 25 18:18:50 UTC 2012     root@taxeater.zerocs.ca:/usr/obj/usr/src/sys/GENERIC  amd64

>Description:
Performance of XTS mode using the aesni.ko driver is lower than expected.  AES-XTS mode is often slower than AES-CBC.

>How-To-Repeat:
Benchmark using tools/tools/cryptotest, or perform large file operations on a GEOM_ELI filesystem.

>Fix:
A patch is attached that makes the aesni driver call the assembly routines in
crypto/openssl.  This at least doubles the current throughput.  The attached
benchmark output is for an i5-2500, 10.0-current, Clang system with debugging
options on.  Applying the patch to stable with gcc results in 16 Gbps throughput
without maxing out the CPU.  Perl will be required to generate the assembly
instructions from the scripts in crypto/openssl.


Patch attached with submission follows:

Index: modules/aesni/Makefile
===================================================================
--- modules/aesni/Makefile	(revision 238810)
+++ modules/aesni/Makefile	(working copy)
@@ -4,7 +4,15 @@
 
 KMOD=	aesni
 SRCS=	aesni.c aesni_wrap.c
-SRCS+=	aesencdec_${MACHINE_CPUARCH}.S aeskeys_${MACHINE_CPUARCH}.S
+# SRCS+=	aesencdec_${MACHINE_CPUARCH}.S aeskeys_${MACHINE_CPUARCH}.S
+SRCS+=	aesencdec_${MACHINE_CPUARCH}.S aesni-xts-${MACHINE_CPUARCH}.s
 SRCS+=	device_if.h bus_if.h opt_bus.h cryptodev_if.h
 
+CLEANFILES+=	aesni-xts-${MACHINE_CPUARCH}.s
+
+aesni-xts-amd64.s: 
+	perl ${.CURDIR}/../../../crypto/openssl/crypto/aes/asm/aesni-x86_64.pl elf > ${.TARGET}
+aesni-xts-i386.s: 
+	perl ${.CURDIR}/../../../crypto/openssl/crypto/aes/asm/aesni-x86.pl elf > ${.TARGET}
+
 .include <bsd.kmod.mk>
Index: crypto/aesni/aesni.h
===================================================================
--- crypto/aesni/aesni.h	(revision 238810)
+++ crypto/aesni/aesni.h	(working copy)
@@ -52,13 +52,18 @@
 #define	AES256_ROUNDS	14
 #define	AES_SCHED_LEN	((AES256_ROUNDS + 1) * AES_BLOCK_LEN)
 
+struct aesni_sched {
+	uint8_t ks[AES_SCHED_LEN] __aligned(16);
+	int rounds;
+};
+
 struct aesni_session {
-	uint8_t enc_schedule[AES_SCHED_LEN] __aligned(16);
-	uint8_t dec_schedule[AES_SCHED_LEN] __aligned(16);
-	uint8_t xts_schedule[AES_SCHED_LEN] __aligned(16);
+	struct aesni_sched enc_sched __aligned(32);
+	struct aesni_sched dec_sched __aligned(32);
+	struct aesni_sched xts_sched __aligned(32);
 	uint8_t iv[AES_BLOCK_LEN];
 	int algo;
-	int rounds;
+	/* int rounds; */
 	/* uint8_t *ses_ictx; */
 	/* uint8_t *ses_octx; */
 	/* int ses_mlen; */
@@ -77,10 +82,16 @@
 void aesni_dec(int rounds, const uint8_t *key_schedule,
     const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN],
     const uint8_t iv[AES_BLOCK_LEN]);
-void aesni_set_enckey(const uint8_t *userkey, uint8_t *encrypt_schedule,
-    int number_of_rounds);
-void aesni_set_deckey(const uint8_t *encrypt_schedule,
-    uint8_t *decrypt_schedule, int number_of_rounds);
+int aesni_set_encrypt_key(const uint8_t *userkey, int keylen,
+    struct aesni_sched *sched);
+int aesni_set_decrypt_key(const uint8_t *userkey, int keylen,
+    struct aesni_sched *sched);
+void aesni_xts_encrypt(uint8_t *from, uint8_t *to, size_t len,
+    struct aesni_sched *key1, struct aesni_sched *key2,
+    uint8_t iv[AES_BLOCK_LEN]);
+void aesni_xts_decrypt(uint8_t *from, uint8_t *to, size_t len,
+    struct aesni_sched *key1, struct aesni_sched *key2,
+    uint8_t iv[AES_BLOCK_LEN]);
 
 /*
  * Slightly more public interfaces.
Index: crypto/aesni/aesni_wrap.c
===================================================================
--- crypto/aesni/aesni_wrap.c	(revision 238810)
+++ crypto/aesni/aesni_wrap.c	(working copy)
@@ -82,6 +82,7 @@
 	}
 }
 
+#if 0
 #define	AES_XTS_BLOCKSIZE	16
 #define	AES_XTS_IVSIZE		8
 #define	AES_XTS_ALPHA		0x87	/* GF(2^128) generator polynomial */
@@ -169,6 +170,7 @@
 	aesni_crypt_xts(rounds, data_schedule, tweak_schedule, len, from, to,
 	    iv, 0);
 }
+#endif
 
 static int
 aesni_cipher_setup_common(struct aesni_session *ses, const uint8_t *key,
@@ -179,13 +181,8 @@
 	case CRYPTO_AES_CBC:
 		switch (keylen) {
 		case 128:
-			ses->rounds = AES128_ROUNDS;
-			break;
 		case 192:
-			ses->rounds = AES192_ROUNDS;
-			break;
 		case 256:
-			ses->rounds = AES256_ROUNDS;
 			break;
 		default:
 			return (EINVAL);
@@ -194,10 +191,8 @@
 	case CRYPTO_AES_XTS:
 		switch (keylen) {
 		case 256:
-			ses->rounds = AES128_ROUNDS;
-			break;
 		case 512:
-			ses->rounds = AES256_ROUNDS;
+			keylen >>= 1;
 			break;
 		default:
 			return (EINVAL);
@@ -207,13 +202,13 @@
 		return (EINVAL);
 	}
 
-	aesni_set_enckey(key, ses->enc_schedule, ses->rounds);
-	aesni_set_deckey(ses->enc_schedule, ses->dec_schedule, ses->rounds);
+	aesni_set_encrypt_key(key, keylen, &ses->enc_sched);
+	aesni_set_decrypt_key(key, keylen, &ses->dec_sched);
 	if (ses->algo == CRYPTO_AES_CBC)
 		arc4rand(ses->iv, sizeof(ses->iv), 0);
 	else /* if (ses->algo == CRYPTO_AES_XTS) */ {
-		aesni_set_enckey(key + keylen / 16, ses->xts_schedule,
-		    ses->rounds);
+		aesni_set_encrypt_key(key + (keylen >> 3), keylen,
+		    &ses->xts_sched);
 	}
 
 	return (0);
@@ -279,12 +274,12 @@
 			crypto_copyback(crp->crp_flags, crp->crp_buf,
 			    enccrd->crd_inject, AES_BLOCK_LEN, ses->iv);
 		if (ses->algo == CRYPTO_AES_CBC) {
-			aesni_encrypt_cbc(ses->rounds, ses->enc_schedule,
-			    enccrd->crd_len, buf, buf, ses->iv);
+			aesni_encrypt_cbc(ses->enc_sched.rounds + 1,
+			    &ses->enc_sched, enccrd->crd_len, buf, buf,
+			    ses->iv);
 		} else /* if (ses->algo == CRYPTO_AES_XTS) */ {
-			aesni_encrypt_xts(ses->rounds, ses->enc_schedule,
-			    ses->xts_schedule, enccrd->crd_len, buf, buf,
-			    ses->iv);
+			aesni_xts_encrypt(buf, buf, enccrd->crd_len,
+			    &ses->enc_sched, &ses->xts_sched, ses->iv);
 		}
 	} else {
 		if ((enccrd->crd_flags & CRD_F_IV_EXPLICIT) != 0)
@@ -293,12 +288,11 @@
 			crypto_copydata(crp->crp_flags, crp->crp_buf,
 			    enccrd->crd_inject, AES_BLOCK_LEN, ses->iv);
 		if (ses->algo == CRYPTO_AES_CBC) {
-			aesni_decrypt_cbc(ses->rounds, ses->dec_schedule,
-			    enccrd->crd_len, buf, ses->iv);
+			aesni_decrypt_cbc(ses->dec_sched.rounds + 1,
+			    &ses->dec_sched, enccrd->crd_len, buf, ses->iv);
 		} else /* if (ses->algo == CRYPTO_AES_XTS) */ {
-			aesni_decrypt_xts(ses->rounds, ses->dec_schedule,
-			    ses->xts_schedule, enccrd->crd_len, buf, buf,
-			    ses->iv);
+			aesni_xts_decrypt(buf, buf, enccrd->crd_len,
+			    &ses->dec_sched, &ses->xts_sched, ses->iv);
 		}
 	}
 	if (saved_ctx)

============= END DIFF =================

aesni@revision 238810:
   3.937 sec, 2000000    aes crypts,      16 bytes,  8128621 byte/sec,    62.0 Mb/sec
   4.105 sec, 2000000    xts crypts,      16 bytes,  7795364 byte/sec,    59.5 Mb/sec
   3.911 sec, 2000000 aes192 crypts,      16 bytes,  8182867 byte/sec,    62.4 Mb/sec
   3.912 sec, 2000000 aes256 crypts,      16 bytes,  8180568 byte/sec,    62.4 Mb/sec
   4.157 sec, 2000000 xts-256 crypts,      16 bytes,  7697874 byte/sec,    58.7 Mb/sec
   5.574 sec, 2000000    aes crypts,     512 bytes, 183701283 byte/sec,  1401.5 Mb/sec
   7.064 sec, 2000000    xts crypts,     512 bytes, 144957859 byte/sec,  1105.9 Mb/sec
   5.793 sec, 2000000 aes192 crypts,     512 bytes, 176753253 byte/sec,  1348.5 Mb/sec
   6.032 sec, 2000000 aes256 crypts,     512 bytes, 169761639 byte/sec,  1295.2 Mb/sec
   7.720 sec, 2000000 xts-256 crypts,     512 bytes, 132636989 byte/sec,  1011.9 Mb/sec
   1.631 sec,  200000    aes crypts,    4096 bytes, 502211582 byte/sec,  3831.6 Mb/sec
   2.690 sec,  200000    xts crypts,    4096 bytes, 304562262 byte/sec,  2323.6 Mb/sec
   1.817 sec,  200000 aes192 crypts,    4096 bytes, 450952825 byte/sec,  3440.5 Mb/sec
   2.001 sec,  200000 aes256 crypts,    4096 bytes, 409313072 byte/sec,  3122.8 Mb/sec
   3.231 sec,  200000 xts-256 crypts,    4096 bytes, 253510291 byte/sec,  1934.1 Mb/sec
aesni_openssl:
   3.927 sec, 2000000    aes crypts,      16 bytes,  8147903 byte/sec,    62.2 Mb/sec
   3.973 sec, 2000000    xts crypts,      16 bytes,  8054614 byte/sec,    61.5 Mb/sec
   3.936 sec, 2000000 aes192 crypts,      16 bytes,  8130494 byte/sec,    62.0 Mb/sec
   3.928 sec, 2000000 aes256 crypts,      16 bytes,  8147643 byte/sec,    62.2 Mb/sec
   4.025 sec, 2000000 xts-256 crypts,      16 bytes,  7950927 byte/sec,    60.7 Mb/sec
   5.500 sec, 2000000    aes crypts,     512 bytes, 186165740 byte/sec,  1420.3 Mb/sec
   4.820 sec, 2000000    xts crypts,     512 bytes, 212429975 byte/sec,  1620.7 Mb/sec
   5.769 sec, 2000000 aes192 crypts,     512 bytes, 177498680 byte/sec,  1354.2 Mb/sec
   5.981 sec, 2000000 aes256 crypts,     512 bytes, 171220193 byte/sec,  1306.3 Mb/sec
   4.956 sec, 2000000 xts-256 crypts,     512 bytes, 206606318 byte/sec,  1576.3 Mb/sec
   1.625 sec,  200000    aes crypts,    4096 bytes, 504053595 byte/sec,  3845.6 Mb/sec
   0.956 sec,  200000    xts crypts,    4096 bytes, 856864328 byte/sec,  6537.4 Mb/sec
   1.813 sec,  200000 aes192 crypts,    4096 bytes, 451837050 byte/sec,  3447.2 Mb/sec
   2.001 sec,  200000 aes256 crypts,    4096 bytes, 409391620 byte/sec,  3123.4 Mb/sec
   1.044 sec,  200000 xts-256 crypts,    4096 bytes, 784530799 byte/sec,  5985.5 Mb/sec


>Release-Note:
>Audit-Trail:
>Unformatted:



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?201207261813.q6QIDo7R040201>