Date: Mon, 7 Apr 2003 23:18:27 -0700 (PDT)
From: Peter Wemm <peter@FreeBSD.org>
To: Perforce Change Reviews <perforce@freebsd.org>
Subject: PERFORCE change 28488 for review
Message-ID: <200304080618.h386IRwB093608@repoman.freebsd.org>
next in thread | raw e-mail | index | archive | help
http://perforce.freebsd.org/chv.cgi?CH=28488 Change 28488 by peter@peter_daintree on 2003/04/07 23:17:56 use the portable in_cksum from powerpc for now Affected files ... .. //depot/projects/hammer/sys/x86_64/include/in_cksum.h#2 edit .. //depot/projects/hammer/sys/x86_64/x86_64/in_cksum.c#2 edit Differences ... ==== //depot/projects/hammer/sys/x86_64/include/in_cksum.h#2 (text+ko) ==== @@ -33,16 +33,12 @@ * from tahoe: in_cksum.c 1.2 86/01/05 * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91 * from: Id: in_cksum.c,v 1.8 1995/12/03 18:35:19 bde Exp - * $FreeBSD: src/sys/i386/include/in_cksum.h,v 1.13 2002/06/22 22:35:53 jdp Exp $ + * $FreeBSD: src/sys/powerpc/include/in_cksum.h,v 1.1 2002/06/29 09:49:24 benno Exp $ */ #ifndef _MACHINE_IN_CKSUM_H_ #define _MACHINE_IN_CKSUM_H_ 1 -/* - * MP safe (alfred) - */ - #include <sys/cdefs.h> #define in_cksum(m, len) in_cksum_skip(m, len, 0) @@ -54,33 +50,6 @@ * therefore always exactly five 32-bit words. */ #ifdef __GNUC__ -static __inline u_int -in_cksum_hdr(const struct ip *ip) -{ - register u_int sum = 0; - -/* __volatile is necessary here because the condition codes are used. */ -#define ADD(n) __asm __volatile ("addl %1, %0" : "+r" (sum) : \ - "g" (((const u_int32_t *)ip)[n / 4])) -#define ADDC(n) __asm __volatile ("adcl %1, %0" : "+r" (sum) : \ - "g" (((const u_int32_t *)ip)[n / 4])) -#define MOP __asm __volatile ("adcl $0, %0" : "+r" (sum)) - - ADD(0); - ADDC(4); - ADDC(8); - ADDC(12); - ADDC(16); - MOP; -#undef ADD -#undef ADDC -#undef MOP - sum = (sum & 0xffff) + (sum >> 16); - if (sum > 0xffff) - sum -= 0xffff; - - return ~sum & 0xffff; -} static __inline void in_cksum_update(struct ip *ip) @@ -90,32 +59,8 @@ ip->ip_sum = htons(__tmpsum + (__tmpsum >> 16)); } -static __inline u_short -in_addword(u_short sum, u_short b) -{ - /* __volatile is necessary because the condition codes are used. 
*/ - __asm __volatile ("addw %1, %0" : "+r" (sum) : "r" (b)); - __asm __volatile ("adcw $0, %0" : "+r" (sum)); +#else - return (sum); -} - -static __inline u_short -in_pseudo(u_int sum, u_int b, u_int c) -{ - /* __volatile is necessary because the condition codes are used. */ - __asm __volatile ("addl %1, %0" : "+r" (sum) : "g" (b)); - __asm __volatile ("adcl %1, %0" : "+r" (sum) : "g" (c)); - __asm __volatile ("adcl $0, %0" : "+r" (sum)); - - sum = (sum & 0xffff) + (sum >> 16); - if (sum > 0xffff) - sum -= 0xffff; - return (sum); -} - -#else -u_int in_cksum_hdr(const struct ip *); #define in_cksum_update(ip) \ do { \ int __tmpsum; \ @@ -126,7 +71,10 @@ #endif #ifdef _KERNEL -u_short in_cksum_skip(struct mbuf *m, int len, int skip); -#endif /* _KERNEL */ +u_int in_cksum_hdr(const struct ip *ip); +u_short in_addword(u_short sum, u_short b); +u_short in_pseudo(u_int sum, u_int b, u_int c); +u_short in_cksum_skip(struct mbuf *m, int len, int skip); +#endif #endif /* _MACHINE_IN_CKSUM_H_ */ ==== //depot/projects/hammer/sys/x86_64/x86_64/in_cksum.c#2 (text+ko) ==== @@ -1,6 +1,11 @@ -/*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. +/* $FreeBSD: src/sys/powerpc/powerpc/in_cksum.c,v 1.2 2003/02/13 08:56:41 grehan Exp $ */ +/* $NetBSD: in_cksum.c,v 1.7 1997/09/02 13:18:15 thorpej Exp $ */ + +/* + * Copyright (c) 1988, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 1996 + * Matt Thomas <matt@3am-software.com> * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,241 +35,216 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * from tahoe: in_cksum.c 1.2 86/01/05 - * from: @(#)in_cksum.c 1.3 (Berkeley) 1/19/91 - * $FreeBSD: src/sys/i386/i386/in_cksum.c,v 1.24 2002/06/22 22:35:49 jdp Exp $ + * @(#)in_cksum.c 8.1 (Berkeley) 6/10/93 */ -/* - * MPsafe: alfred - */ +#include <sys/cdefs.h> /* RCS ID & Copyright macro defns */ + #include <sys/param.h> +#include <sys/mbuf.h> #include <sys/systm.h> -#include <sys/mbuf.h> - +#include <netinet/in_systm.h> #include <netinet/in.h> -#include <netinet/in_systm.h> #include <netinet/ip.h> - #include <machine/in_cksum.h> /* - * Checksum routine for Internet Protocol family headers. + * Checksum routine for Internet Protocol family headers + * (Portable Alpha version). * * This routine is very heavily used in the network * code and should be modified for each CPU to be as fast as possible. - * - * This implementation is 386 version. */ -#undef ADDCARRY -#define ADDCARRY(x) if ((x) > 0xffff) (x) -= 0xffff -#define REDUCE {sum = (sum & 0xffff) + (sum >> 16); ADDCARRY(sum);} +#define ADDCARRY(x) (x > 65535 ? 
x -= 65535 : x) +#define REDUCE32 \ + { \ + q_util.q = sum; \ + sum = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \ + } +#define REDUCE16 \ + { \ + q_util.q = sum; \ + l_util.l = q_util.s[0] + q_util.s[1] + q_util.s[2] + q_util.s[3]; \ + sum = l_util.s[0] + l_util.s[1]; \ + ADDCARRY(sum); \ + } + +static const u_int32_t in_masks[] = { +#if 0 + /*0 bytes*/ /*1 byte*/ /*2 bytes*/ /*3 bytes*/ + 0x00000000, 0x000000FF, 0x0000FFFF, 0x00FFFFFF, /* offset 0 */ + 0x00000000, 0x0000FF00, 0x00FFFF00, 0xFFFFFF00, /* offset 1 */ + 0x00000000, 0x00FF0000, 0xFFFF0000, 0xFFFF0000, /* offset 2 */ + 0x00000000, 0xFF000000, 0xFF000000, 0xFF000000, /* offset 3 */ +#else + /*0 bytes*/ /*1 byte*/ /*2 bytes*/ /*3 bytes*/ + 0x00000000, 0xFF000000, 0xFFFF0000, 0xFFFFFF00, /* offset 0 */ + 0x00000000, 0x00FF0000, 0x00FFFF00, 0x00FFFFFF, /* offset 1 */ + 0x00000000, 0x0000FF00, 0x0000FFFF, 0x0000FFFF, /* offset 2 */ + 0x00000000, 0x000000FF, 0x000000FF, 0x000000FF, /* offset 3 */ +#endif +}; -/* - * These asm statements require __volatile because they pass information - * via the condition codes. GCC does not currently provide a way to specify - * the condition codes as an input or output operand. - * - * The LOAD macro below is effectively a prefetch into cache. GCC will - * load the value into a register but will not use it. Since modern CPUs - * reorder operations, this will generally take place in parallel with - * other calculations. 
- */ -#define ADD(n) __asm __volatile \ - ("addl %1, %0" : "+r" (sum) : \ - "g" (((const u_int32_t *)w)[n / 4])) -#define ADDC(n) __asm __volatile \ - ("adcl %1, %0" : "+r" (sum) : \ - "g" (((const u_int32_t *)w)[n / 4])) -#define LOAD(n) __asm __volatile \ - ("" : : "r" (((const u_int32_t *)w)[n / 4])) -#define MOP __asm __volatile \ - ("adcl $0, %0" : "+r" (sum)) +union l_util { + u_int16_t s[2]; + u_int32_t l; +}; +union q_util { + u_int16_t s[4]; + u_int32_t l[2]; + u_int64_t q; +}; -u_short -in_cksum_skip(m, len, skip) - struct mbuf *m; - int len; - int skip; +static u_int64_t +in_cksumdata(const void *buf, int len) { - register u_short *w; - register unsigned sum = 0; - register int mlen = 0; - int byte_swapped = 0; - union { char c[2]; u_short s; } su; + const u_int32_t *lw = (const u_int32_t *) buf; + u_int64_t sum = 0; + u_int64_t prefilled; + int offset; + union q_util q_util; + + if ((3 & (long) lw) == 0 && len == 20) { + sum = (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3] + lw[4]; + REDUCE32; + return sum; + } - len -= skip; - for (; skip && m; m = m->m_next) { - if (m->m_len > skip) { - mlen = m->m_len - skip; - w = (u_short *)(mtod(m, u_char *) + skip); - goto skip_start; - } else { - skip -= m->m_len; + if ((offset = 3 & (long) lw) != 0) { + const u_int32_t *masks = in_masks + (offset << 2); + lw = (u_int32_t *) (((long) lw) - offset); + sum = *lw++ & masks[len >= 3 ? 3 : len]; + len -= 4 - offset; + if (len <= 0) { + REDUCE32; + return sum; + } + } +#if 0 + /* + * Force to cache line boundary. + */ + offset = 32 - (0x1f & (long) lw); + if (offset < 32 && len > offset) { + len -= offset; + if (4 & offset) { + sum += (u_int64_t) lw[0]; + lw += 1; + } + if (8 & offset) { + sum += (u_int64_t) lw[0] + lw[1]; + lw += 2; + } + if (16 & offset) { + sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]; + lw += 4; } } +#endif + /* + * access prefilling to start load of next cache line. + * then add current cache line + * save result of prefilling for loop iteration. 
+ */ + prefilled = lw[0]; + while ((len -= 32) >= 4) { + u_int64_t prefilling = lw[8]; + sum += prefilled + lw[1] + lw[2] + lw[3] + + lw[4] + lw[5] + lw[6] + lw[7]; + lw += 8; + prefilled = prefilling; + } + if (len >= 0) { + sum += prefilled + lw[1] + lw[2] + lw[3] + + lw[4] + lw[5] + lw[6] + lw[7]; + lw += 8; + } else { + len += 32; + } + while ((len -= 16) >= 0) { + sum += (u_int64_t) lw[0] + lw[1] + lw[2] + lw[3]; + lw += 4; + } + len += 16; + while ((len -= 4) >= 0) { + sum += (u_int64_t) *lw++; + } + len += 4; + if (len > 0) + sum += (u_int64_t) (in_masks[len] & *lw); + REDUCE32; + return sum; +} + +u_short +in_addword(u_short a, u_short b) +{ + u_int64_t sum = a + b; + + ADDCARRY(sum); + return (sum); +} + +u_short +in_pseudo(u_int32_t a, u_int32_t b, u_int32_t c) +{ + u_int64_t sum; + union q_util q_util; + union l_util l_util; + + sum = (u_int64_t) a + b + c; + REDUCE16; + return (sum); +} + +u_short +in_cksum_skip(struct mbuf *m, int len, int skip) +{ + u_int64_t sum = 0; + int mlen = 0; + int clen = 0; + caddr_t addr; + union q_util q_util; + union l_util l_util; - for (;m && len; m = m->m_next) { + len -= skip; + for (; skip && m; m = m->m_next) { + if (m->m_len > skip) { + mlen = m->m_len - skip; + addr = mtod(m, caddr_t) + skip; + goto skip_start; + } else { + skip -= m->m_len; + } + } + + for (; m && len; m = m->m_next) { if (m->m_len == 0) continue; - w = mtod(m, u_short *); - if (mlen == -1) { - /* - * The first byte of this mbuf is the continuation - * of a word spanning between this mbuf and the - * last mbuf. - */ - - /* su.c[0] is already saved when scanning previous - * mbuf. 
sum was REDUCEd when we found mlen == -1 - */ - su.c[1] = *(u_char *)w; - sum += su.s; - w = (u_short *)((char *)w + 1); - mlen = m->m_len - 1; - len--; - } else - mlen = m->m_len; + mlen = m->m_len; + addr = mtod(m, caddr_t); skip_start: if (len < mlen) mlen = len; + + if ((clen ^ (long) addr) & 1) + sum += in_cksumdata(addr, mlen) << 8; + else + sum += in_cksumdata(addr, mlen); + + clen += mlen; len -= mlen; - /* - * Force to long boundary so we do longword aligned - * memory operations - */ - if (3 & (int) w) { - REDUCE; - if ((1 & (int) w) && (mlen > 0)) { - sum <<= 8; - su.c[0] = *(char *)w; - w = (u_short *)((char *)w + 1); - mlen--; - byte_swapped = 1; - } - if ((2 & (int) w) && (mlen >= 2)) { - sum += *w++; - mlen -= 2; - } - } - /* - * Advance to a 486 cache line boundary. - */ - if (4 & (int) w && mlen >= 4) { - ADD(0); - MOP; - w += 2; - mlen -= 4; - } - if (8 & (int) w && mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - /* - * Do as much of the checksum as possible 32 bits at at time. - * In fact, this loop is unrolled to make overhead from - * branches &c small. - */ - mlen -= 1; - while ((mlen -= 32) >= 0) { - /* - * Add with carry 16 words and fold in the last - * carry by adding a 0 with carry. - * - * The early ADD(16) and the LOAD(32) are to load - * the next 2 cache lines in advance on 486's. The - * 486 has a penalty of 2 clock cycles for loading - * a cache line, plus whatever time the external - * memory takes to load the first word(s) addressed. - * These penalties are unavoidable. Subsequent - * accesses to a cache line being loaded (and to - * other external memory?) are delayed until the - * whole load finishes. These penalties are mostly - * avoided by not accessing external memory for - * 8 cycles after the ADD(16) and 12 cycles after - * the LOAD(32). The loop terminates when mlen - * is initially 33 (not 32) to guaranteed that - * the LOAD(32) is within bounds. 
- */ - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - LOAD(32); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - } - mlen += 32 + 1; - if (mlen >= 32) { - ADD(16); - ADDC(0); - ADDC(4); - ADDC(8); - ADDC(12); - ADDC(20); - ADDC(24); - ADDC(28); - MOP; - w += 16; - mlen -= 32; - } - if (mlen >= 16) { - ADD(0); - ADDC(4); - ADDC(8); - ADDC(12); - MOP; - w += 8; - mlen -= 16; - } - if (mlen >= 8) { - ADD(0); - ADDC(4); - MOP; - w += 4; - mlen -= 8; - } - if (mlen == 0 && byte_swapped == 0) - continue; /* worth 1% maybe ?? */ - REDUCE; - while ((mlen -= 2) >= 0) { - sum += *w++; - } - if (byte_swapped) { - sum <<= 8; - byte_swapped = 0; - if (mlen == -1) { - su.c[1] = *(char *)w; - sum += su.s; - mlen = 0; - } else - mlen = -1; - } else if (mlen == -1) - /* - * This mbuf has odd number of bytes. - * There could be a word split betwen - * this mbuf and the next mbuf. - * Save the last byte (to prepend to next mbuf). - */ - su.c[0] = *(char *)w; } + REDUCE16; + return (~sum & 0xffff); +} - if (len) - printf("%s: out of data by %d\n", __func__, len); - if (mlen == -1) { - /* The last mbuf has odd # of bytes. Follow the - standard (the odd byte is shifted left by 8 bits) */ - su.c[1] = 0; - sum += su.s; - } - REDUCE; - return (~sum & 0xffff); +u_int in_cksum_hdr(const struct ip *ip) +{ + u_int64_t sum = in_cksumdata(ip, sizeof(struct ip)); + union q_util q_util; + union l_util l_util; + REDUCE16; + return (~sum & 0xffff); }
Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200304080618.h386IRwB093608>