From owner-svn-src-all@FreeBSD.ORG Sat Nov 16 09:01:25 2013 Return-Path: Delivered-To: svn-src-all@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [8.8.178.115]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by hub.freebsd.org (Postfix) with ESMTPS id 41578925; Sat, 16 Nov 2013 09:01:25 +0000 (UTC) Received: from svn.freebsd.org (svn.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by mx1.freebsd.org (Postfix) with ESMTPS id 2EA9F2598; Sat, 16 Nov 2013 09:01:25 +0000 (UTC) Received: from svn.freebsd.org ([127.0.1.70]) by svn.freebsd.org (8.14.7/8.14.7) with ESMTP id rAG91PEi025569; Sat, 16 Nov 2013 09:01:25 GMT (envelope-from jmg@svn.freebsd.org) Received: (from jmg@localhost) by svn.freebsd.org (8.14.7/8.14.5/Submit) id rAG91O4g025564; Sat, 16 Nov 2013 09:01:24 GMT (envelope-from jmg@svn.freebsd.org) Message-Id: <201311160901.rAG91O4g025564@svn.freebsd.org> From: John-Mark Gurney Date: Sat, 16 Nov 2013 09:01:24 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-stable@freebsd.org, svn-src-stable-10@freebsd.org Subject: svn commit: r258212 - stable/10/sys/crypto/aesni X-SVN-Group: stable-10 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.16 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 16 Nov 2013 09:01:25 -0000 Author: jmg Date: Sat Nov 16 09:01:24 2013 New Revision: 258212 URL: http://svnweb.freebsd.org/changeset/base/258212 Log: MFC r257757: fix issues w/ AES-NI on unaligned data blocks... 
Approved by: re (kib) Modified: stable/10/sys/crypto/aesni/aesencdec.h stable/10/sys/crypto/aesni/aesni.h stable/10/sys/crypto/aesni/aesni_wrap.c Directory Properties: stable/10/sys/ (props changed) Modified: stable/10/sys/crypto/aesni/aesencdec.h ============================================================================== --- stable/10/sys/crypto/aesni/aesencdec.h Sat Nov 16 08:28:14 2013 (r258211) +++ stable/10/sys/crypto/aesni/aesencdec.h Sat Nov 16 09:01:24 2013 (r258212) @@ -30,11 +30,10 @@ #include <wmmintrin.h> static inline void -aesni_enc8(int rounds, const uint8_t *key_schedule, __m128i a, +aesni_enc8(int rounds, const __m128i *keysched, __m128i a, __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g, __m128i h, __m128i out[8]) { - const __m128i *keysched = (const __m128i *)key_schedule; int i; a ^= keysched[0]; @@ -68,11 +67,10 @@ aesni_enc8(int rounds, const uint8_t *ke } static inline void -aesni_dec8(int rounds, const uint8_t *key_schedule, __m128i a, +aesni_dec8(int rounds, const __m128i *keysched, __m128i a, __m128i b, __m128i c, __m128i d, __m128i e, __m128i f, __m128i g, __m128i h, __m128i out[8]) { - const __m128i *keysched = (const __m128i *)key_schedule; int i; a ^= keysched[0]; @@ -106,10 +104,9 @@ aesni_dec8(int rounds, const uint8_t *ke } static inline __m128i -aesni_enc(int rounds, const uint8_t *key_schedule, const __m128i from) +aesni_enc(int rounds, const __m128i *keysched, const __m128i from) { __m128i tmp; - const __m128i *keysched = (const __m128i *)key_schedule; int i; tmp = from ^ keysched[0]; @@ -121,10 +118,9 @@ aesni_enc(int rounds, const uint8_t *key } static inline __m128i -aesni_dec(int rounds, const uint8_t *key_schedule, const __m128i from) +aesni_dec(int rounds, const __m128i *keysched, const __m128i from) { __m128i tmp; - const __m128i *keysched = (const __m128i *)key_schedule; int i; tmp = from ^ keysched[0]; Modified: stable/10/sys/crypto/aesni/aesni.h
============================================================================== --- stable/10/sys/crypto/aesni/aesni.h Sat Nov 16 08:28:14 2013 (r258211) +++ stable/10/sys/crypto/aesni/aesni.h Sat Nov 16 09:01:24 2013 (r258212) @@ -71,29 +71,30 @@ struct aesni_session { /* * Internal functions, implemented in assembler. */ -void aesni_set_enckey(const uint8_t *userkey, uint8_t *encrypt_schedule, - int number_of_rounds); -void aesni_set_deckey(const uint8_t *encrypt_schedule, - uint8_t *decrypt_schedule, int number_of_rounds); +void aesni_set_enckey(const uint8_t *userkey, + uint8_t *encrypt_schedule /*__aligned(16)*/, int number_of_rounds); +void aesni_set_deckey(const uint8_t *encrypt_schedule /*__aligned(16)*/, + uint8_t *decrypt_schedule /*__aligned(16)*/, int number_of_rounds); /* * Slightly more public interfaces. */ -void aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len, - const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]); -void aesni_decrypt_cbc(int rounds, const void *key_schedule, size_t len, - uint8_t *buf, const uint8_t iv[AES_BLOCK_LEN]); -void aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len, - const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]); -void aesni_decrypt_ecb(int rounds, const void *key_schedule, size_t len, - const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]); - -void aesni_encrypt_xts(int rounds, const void *data_schedule, - const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to, - const uint8_t iv[AES_BLOCK_LEN]); -void aesni_decrypt_xts(int rounds, const void *data_schedule, - const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to, +void aesni_encrypt_cbc(int rounds, const void *key_schedule /*__aligned(16)*/, + size_t len, const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]); +void aesni_decrypt_cbc(int rounds, const void *key_schedule /*__aligned(16)*/, + size_t len, uint8_t *buf, const uint8_t iv[AES_BLOCK_LEN]); 
+void aesni_encrypt_ecb(int rounds, const void *key_schedule /*__aligned(16)*/, + size_t len, const uint8_t *from, uint8_t *to); +void aesni_decrypt_ecb(int rounds, const void *key_schedule /*__aligned(16)*/, + size_t len, const uint8_t *from, uint8_t *to); + +void aesni_encrypt_xts(int rounds, const void *data_schedule /*__aligned(16)*/, + const void *tweak_schedule /*__aligned(16)*/, size_t len, + const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]); +void aesni_decrypt_xts(int rounds, const void *data_schedule /*__aligned(16)*/, + const void *tweak_schedule /*__aligned(16)*/, size_t len, + const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]); int aesni_cipher_setup(struct aesni_session *ses, struct cryptoini *encini); @@ -103,4 +104,4 @@ int aesni_cipher_process(struct aesni_se uint8_t *aesni_cipher_alloc(struct cryptodesc *enccrd, struct cryptop *crp, int *allocated); -#endif +#endif /* _AESNI_H_ */ Modified: stable/10/sys/crypto/aesni/aesni_wrap.c ============================================================================== --- stable/10/sys/crypto/aesni/aesni_wrap.c Sat Nov 16 08:28:14 2013 (r258211) +++ stable/10/sys/crypto/aesni/aesni_wrap.c Sat Nov 16 09:01:24 2013 (r258212) @@ -41,6 +41,10 @@ __FBSDID("$FreeBSD$"); MALLOC_DECLARE(M_AESNI); +struct blocks8 { + __m128i blk[8]; +} __packed; + void aesni_encrypt_cbc(int rounds, const void *key_schedule, size_t len, const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]) @@ -65,20 +69,20 @@ aesni_decrypt_cbc(int rounds, const void uint8_t *buf, const uint8_t iv[AES_BLOCK_LEN]) { __m128i blocks[8]; - __m128i *bufs; + struct blocks8 *blks; __m128i ivreg, nextiv; size_t i, j, cnt; ivreg = _mm_loadu_si128((const __m128i *)iv); cnt = len / AES_BLOCK_LEN / 8; for (i = 0; i < cnt; i++) { - bufs = (__m128i *)buf; - aesni_dec8(rounds - 1, key_schedule, bufs[0], bufs[1], - bufs[2], bufs[3], bufs[4], bufs[5], bufs[6], - bufs[7], &blocks[0]); + blks = (struct blocks8 *)buf; + 
aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], + blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], + blks->blk[6], blks->blk[7], &blocks[0]); for (j = 0; j < 8; j++) { - nextiv = bufs[j]; - bufs[j] = blocks[j] ^ ivreg; + nextiv = blks->blk[j]; + blks->blk[j] = blocks[j] ^ ivreg; ivreg = nextiv; } buf += AES_BLOCK_LEN * 8; @@ -86,9 +90,9 @@ aesni_decrypt_cbc(int rounds, const void i *= 8; cnt = len / AES_BLOCK_LEN; for (; i < cnt; i++) { - bufs = (__m128i *)buf; - nextiv = bufs[0]; - bufs[0] = aesni_dec(rounds - 1, key_schedule, bufs[0]) ^ ivreg; + nextiv = _mm_loadu_si128((void *)buf); + _mm_storeu_si128((void *)buf, + aesni_dec(rounds - 1, key_schedule, nextiv) ^ ivreg); ivreg = nextiv; buf += AES_BLOCK_LEN; } @@ -99,15 +103,26 @@ aesni_encrypt_ecb(int rounds, const void const uint8_t *from, uint8_t *to) { __m128i tot; - const __m128i *blocks; + __m128i tout[8]; + struct blocks8 *top; + const struct blocks8 *blks; size_t i, cnt; cnt = len / AES_BLOCK_LEN / 8; for (i = 0; i < cnt; i++) { - blocks = (const __m128i *)from; - aesni_enc8(rounds - 1, key_schedule, blocks[0], blocks[1], - blocks[2], blocks[3], blocks[4], blocks[5], blocks[6], - blocks[7], (__m128i *)to); + blks = (const struct blocks8 *)from; + top = (struct blocks8 *)to; + aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], + blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], + blks->blk[6], blks->blk[7], tout); + top->blk[0] = tout[0]; + top->blk[1] = tout[1]; + top->blk[2] = tout[2]; + top->blk[3] = tout[3]; + top->blk[4] = tout[4]; + top->blk[5] = tout[5]; + top->blk[6] = tout[6]; + top->blk[7] = tout[7]; from += AES_BLOCK_LEN * 8; to += AES_BLOCK_LEN * 8; } @@ -127,15 +142,26 @@ aesni_decrypt_ecb(int rounds, const void const uint8_t from[AES_BLOCK_LEN], uint8_t to[AES_BLOCK_LEN]) { __m128i tot; - const __m128i *blocks; + __m128i tout[8]; + const struct blocks8 *blks; + struct blocks8 *top; size_t i, cnt; cnt = len / AES_BLOCK_LEN / 8; for (i = 0; i < cnt; 
i++) { - blocks = (const __m128i *)from; - aesni_dec8(rounds - 1, key_schedule, blocks[0], blocks[1], - blocks[2], blocks[3], blocks[4], blocks[5], blocks[6], - blocks[7], (__m128i *)to); + blks = (const struct blocks8 *)from; + top = (struct blocks8 *)to; + aesni_dec8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], + blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], + blks->blk[6], blks->blk[7], tout); + top->blk[0] = tout[0]; + top->blk[1] = tout[1]; + top->blk[2] = tout[2]; + top->blk[3] = tout[3]; + top->blk[4] = tout[4]; + top->blk[5] = tout[5]; + top->blk[6] = tout[6]; + top->blk[7] = tout[7]; from += AES_BLOCK_LEN * 8; to += AES_BLOCK_LEN * 8; } @@ -173,31 +199,33 @@ xts_crank_lfsr(__m128i inp) } static void -aesni_crypt_xts_block(int rounds, const void *key_schedule, __m128i *tweak, - const __m128i *from, __m128i *to, int do_encrypt) +aesni_crypt_xts_block(int rounds, const __m128i *key_schedule, __m128i *tweak, + const uint8_t *from, uint8_t *to, int do_encrypt) { __m128i block; - block = *from ^ *tweak; + block = _mm_loadu_si128((const __m128i *)from) ^ *tweak; if (do_encrypt) block = aesni_enc(rounds - 1, key_schedule, block); else block = aesni_dec(rounds - 1, key_schedule, block); - *to = block ^ *tweak; + _mm_storeu_si128((__m128i *)to, block ^ *tweak); *tweak = xts_crank_lfsr(*tweak); } static void -aesni_crypt_xts_block8(int rounds, const void *key_schedule, __m128i *tweak, - const __m128i *from, __m128i *to, int do_encrypt) +aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak, + const uint8_t *from, uint8_t *to, int do_encrypt) { __m128i tmptweak; __m128i a, b, c, d, e, f, g, h; __m128i tweaks[8]; __m128i tmp[8]; + __m128i *top; + const __m128i *fromp; tmptweak = *tweak; @@ -205,10 +233,12 @@ aesni_crypt_xts_block8(int rounds, const * unroll the loop. This lets gcc put values directly in the * register and saves memory accesses. 
*/ + fromp = (const __m128i *)from; #define PREPINP(v, pos) \ do { \ tweaks[(pos)] = tmptweak; \ - (v) = from[(pos)] ^ tmptweak; \ + (v) = _mm_loadu_si128(&fromp[pos]) ^ \ + tmptweak; \ tmptweak = xts_crank_lfsr(tmptweak); \ } while (0) PREPINP(a, 0); @@ -228,20 +258,21 @@ aesni_crypt_xts_block8(int rounds, const aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h, tmp); - to[0] = tmp[0] ^ tweaks[0]; - to[1] = tmp[1] ^ tweaks[1]; - to[2] = tmp[2] ^ tweaks[2]; - to[3] = tmp[3] ^ tweaks[3]; - to[4] = tmp[4] ^ tweaks[4]; - to[5] = tmp[5] ^ tweaks[5]; - to[6] = tmp[6] ^ tweaks[6]; - to[7] = tmp[7] ^ tweaks[7]; + top = (__m128i *)to; + _mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]); + _mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]); + _mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]); + _mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]); + _mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]); + _mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]); + _mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]); + _mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]); } static void -aesni_crypt_xts(int rounds, const void *data_schedule, - const void *tweak_schedule, size_t len, const uint8_t *from, uint8_t *to, - const uint8_t iv[AES_BLOCK_LEN], int do_encrypt) +aesni_crypt_xts(int rounds, const __m128i *data_schedule, + const __m128i *tweak_schedule, size_t len, const uint8_t *from, + uint8_t *to, const uint8_t iv[AES_BLOCK_LEN], int do_encrypt) { __m128i tweakreg; uint8_t tweak[AES_XTS_BLOCKSIZE] __aligned(16); @@ -264,7 +295,7 @@ aesni_crypt_xts(int rounds, const void * cnt = len / AES_XTS_BLOCKSIZE / 8; for (i = 0; i < cnt; i++) { aesni_crypt_xts_block8(rounds, data_schedule, &tweakreg, - (const __m128i *)from, (__m128i *)to, do_encrypt); + from, to, do_encrypt); from += AES_XTS_BLOCKSIZE * 8; to += AES_XTS_BLOCKSIZE * 8; } @@ -272,7 +303,7 @@ aesni_crypt_xts(int rounds, const void * cnt = len / AES_XTS_BLOCKSIZE; for (; i < cnt; i++) { aesni_crypt_xts_block(rounds, data_schedule, &tweakreg, - (const __m128i 
*)from, (__m128i *)to, do_encrypt); + from, to, do_encrypt); from += AES_XTS_BLOCKSIZE; to += AES_XTS_BLOCKSIZE; }