Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 23 Jun 2001 14:02:41 -0700 (PDT)
From:      Matt Dillon <dillon@earth.backplane.com>
To:        Mikhail Teterin <mi@aldan.algebra.com>, jlemon@FreeBSD.ORG, cvs-committers@FreeBSD.ORG, cvs-all@FreeBSD.ORG
Subject:   Inline optimized bzero (was Re: cvs commit: src/sys/netinet tcp_subr.c)
Message-ID:  <200106232102.f5NL2fY73920@earth.backplane.com>
References:  <200106231912.f5NJCUE01011@aldan.algebra.com> 

next in thread | previous in thread | raw e-mail | index | archive | help
    I would propose adding a new kernel bzero() function, called bzerol(),
    which is an inline integer-aligned implementation.

    This implementation should be called for integer-aligned buffers with
    known (constant) sizes; it generates about as many assembly
    instructions as a call to bzero() itself costs.

    I did a quick perusal of the kernel code and an unbelievable number of
    bzero() calls could be converted.


Test1 - bcopy 20x2 bytes        204.90 nS/loop
Test2 - manual load data         26.61 nS/loop
Test3 - man load w/ptrs          36.38 nS/loop
Test4 - mlptrs & bzero          163.96 nS/loop
Test5 - mlptrszer & call        182.46 nS/loop
Test6 - mlptrszerc/bzerol        67.21 nS/loop
Test7 - bigbuf/libc-bzero       621.11 nS/loop
Test8 - bigbuf/bzerol           669.10 nS/loop

/*
 * MEMTEST.C
 */

#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>

#define LOOPS	1000000

/*
 * struct DBuf - record the test routines read from and store into.
 *
 *	x and y are two 5-int arrays.  The trailing 256-byte member is never
 *	referenced by the tests; going by its name it is presumably padding
 *	meant to keep separate instances off a shared cache line.
 */
struct DBuf {
    int	x[5];
    int	y[5];
    char notonsamecacheline[256];
};

/* Destination record written by every test. */
struct DBuf DBuf;

/* Source record handed to the tests that take a template argument. */
struct DBuf Template;

/* Second source record, reached only through GlobPtr. */
struct DBuf Template2;
struct DBuf *GlobPtr = &Template2;

/*
 * Forward declarations for the file-local routines defined further down.
 * test1() and test2() take no argument; test3() through test8() receive a
 * pointer to the record they load values from.
 */
static void showtimes(struct timeval *t1, struct timeval *t2, const char *str, int loops);
static void test1(void);
static void test2(void);
static void test3(struct DBuf *template);
static void test4(struct DBuf *template);
static void test5(struct DBuf *template);
static void test6(struct DBuf *template);
static void test7(struct DBuf *template);
static void test8(struct DBuf *template);
static int simplecall(int a, int b, int c);
static void bzerol(void *s, int bytes);

/*
 * Large scratch buffer cleared by test7() (via bzero) and test8() (via
 * bzerol).
 * NOTE(review): bzerol() stores through int pointers, but a char array is
 * not guaranteed to be int-aligned -- confirm the toolchain aligns it.
 */
char XBuf[1024];

/*
 * bzerol() - aligned bzero.  The buffer must be integer aligned and sized.
 *
 *	s	start of the buffer.  It is written through int pointers, so
 *		it must be suitably aligned for int.
 *	bytes	number of bytes to clear.  Must be a multiple of sizeof(int)
 *		(asserted).  A zero or negative size clears nothing.
 *
 *	This routine should only be called with constant sizes, so GCC can
 *	optimize it.  This routine typically optimizes down to just a few
 *	instructions.  One to five ints are cleared with straight-line
 *	stores; anything larger is handled by the loops in the default case.
 */

static __inline void
bzerol(void *s, int bytes)
{
    int *p = s;

    assert((bytes & (sizeof(int) - 1)) == 0);

    switch(bytes) {
    case sizeof(int) * 5:
	p[4] = 0;
	/* fall through */
    case sizeof(int) * 4:
	p[3] = 0;
	/* fall through */
    case sizeof(int) * 3:
	p[2] = 0;
	/* fall through */
    case sizeof(int) * 2:
	p[1] = 0;
	/* fall through */
    case sizeof(int) * 1:
	p[0] = 0;
	/* fall through */
    case 0:
	return;
    default:
	/*
	 * All size comparisons below are done in signed int.  Comparing
	 * against a bare sizeof would promote a negative 'bytes' to a huge
	 * unsigned value and send the loops far past the buffer.
	 */
	if (bytes < 0)
	    return;

	/* Big buffers: clear four ints per iteration. */
	if (bytes >= (int)sizeof(int) * 8) {
	    while (bytes >= (int)sizeof(int) * 4) {
		p[0] = 0;
		p[1] = 0;
		p[2] = 0;
		p[3] = 0;
		p += 4;
		bytes -= (int)sizeof(int) * 4;
	    }
	}

	/*
	 * Remaining ints, cleared from the top down.  The step is
	 * sizeof(int) rather than a literal 4 so the stores stay aligned
	 * and in bounds whatever the width of int, and the loop stops
	 * before a partial trailing int instead of stepping below 'p'.
	 */
	while (bytes >= (int)sizeof(int)) {
	    bytes -= (int)sizeof(int);
	    *(int *)((char *)p + bytes) = 0;
	}
    }
}


/*
 * TIME_TEST() - time LOOPS back-to-back executions of one test call.
 *
 *	'call' is pasted in textually rather than passed as a function
 *	pointer, so the compiler still sees a direct call to a static
 *	function, just as in a hand-written loop.  The call and
 *	gettimeofday() are each executed once untimed before the clock is
 *	started (presumably as a warm-up); the timed loop follows and the
 *	result is printed by showtimes() under 'label'.
 */
#define TIME_TEST(label, call)						\
    do {								\
	struct timeval tbeg_;						\
	struct timeval tend_;						\
	int left_;							\
									\
	call;								\
	gettimeofday(&tend_, NULL);					\
	gettimeofday(&tbeg_, NULL);					\
	for (left_ = LOOPS; left_; --left_)				\
	    call;							\
	gettimeofday(&tend_, NULL);					\
	showtimes(&tbeg_, &tend_, (label), LOOPS);			\
    } while (0)

/*
 * main() - run the eight tests in order and print one timing line each.
 *
 *	The command line is not used.  The labels for tests 6 and 8 name the
 *	routine actually being timed, bzerol().  Always returns 0.
 */
int
main(int ac, char **av)
{
    (void)ac;
    (void)av;

    TIME_TEST("Test1 - bcopy 20x2 bytes", test1());
    TIME_TEST("Test2 - manual load data", test2());
    TIME_TEST("Test3 - man load w/ptrs ", test3(&Template));
    TIME_TEST("Test4 - mlptrs & bzero  ", test4(&Template));
    TIME_TEST("Test5 - mlptrszer & call", test5(&Template));
    TIME_TEST("Test6 - mlptrszerc/bzerol", test6(&Template));
    TIME_TEST("Test7 - bigbuf/libc-bzero", test7(&Template));
    TIME_TEST("Test8 - bigbuf/bzerol   ", test8(&Template));

    return(0);
}

#undef TIME_TEST

/*
 * showtimes() - print the average time per loop iteration.
 *
 *	t1	timestamp taken before the loop
 *	t2	timestamp taken after the loop
 *	str	label printed at the start of the line
 *	loops	number of iterations executed between t1 and t2
 *
 *	The elapsed time is computed in microseconds and reported as
 *	nanoseconds per loop on stdout.
 */
static void
showtimes(struct timeval *t1, struct timeval *t2, const char *str, int loops)
{
    long usec_part;
    long sec_part;
    long elapsed_us;
    double ns_per_loop;

    /* Borrow one second up front so the microsecond term stays positive. */
    usec_part = t2->tv_usec + 1000000 - t1->tv_usec;
    sec_part = (t2->tv_sec - t1->tv_sec - 1) * 1000000;
    elapsed_us = usec_part + sec_part;

    ns_per_loop = (double)elapsed_us * 1000.0 / (double)loops;
    printf("%s\t%6.2f nS/loop\n", str, ns_per_loop);
}

/*
 * test1() - copy Template.x and Template.y into DBuf with two bcopy()
 * calls, each sizeof(DBuf.x) / sizeof(DBuf.y) bytes long.
 */
static void
test1(void)
{
    bcopy(Template.x, DBuf.x, sizeof(DBuf.x));
    bcopy(Template.y, DBuf.y, sizeof(DBuf.y));
}

/*
 * test2() - clear all ten ints of DBuf.x and DBuf.y with individual
 * constant stores; no library call and no pointer loads.
 */
static void
test2(void)
{
    DBuf.x[0] = 0;
    DBuf.x[1] = 0;
    DBuf.x[2] = 0;
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;

    DBuf.y[0] = 0;
    DBuf.y[1] = 0;
    DBuf.y[2] = 0;
    DBuf.y[3] = 0;
    DBuf.y[4] = 0;
}

/*
 * test3() - fill DBuf.x and DBuf.y field by field with a mix of constants
 * and values loaded through the global pointer GlobPtr and through the
 * 'template' argument.
 */
static void
test3(struct DBuf *template)
{
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;

    DBuf.y[0] = template->y[0];
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * test4() - the same field-by-field stores as test3(), preceded by a
 * bzero() call that clears DBuf.x.
 */
static void
test4(struct DBuf *template)
{
    bzero(&DBuf.x, sizeof(DBuf.x));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;

    DBuf.y[0] = template->y[0];
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * test5() - bzero() of DBuf.x followed by the field-by-field stores, with
 * DBuf.y[0] taken from the return value of simplecall(1, 2, 3) instead of
 * from 'template'.
 */
static void
test5(struct DBuf *template)
{
    bzero(&DBuf.x, sizeof(DBuf.x));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;

    DBuf.y[0] = simplecall(1, 2, 3);
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * test6() - same statements as test5(), except that DBuf.x is cleared with
 * the inline bzerol() instead of bzero().
 */
static void
test6(struct DBuf *template)
{
    bzerol(&DBuf.x, sizeof(DBuf.x));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;

    DBuf.y[0] = simplecall(1, 2, 3);
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * test7() - clear the whole of XBuf with bzero(), then perform the same
 * DBuf stores and simplecall() as test5().
 */
static void
test7(struct DBuf *template)
{
    bzero(XBuf, sizeof(XBuf));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;

    DBuf.y[0] = simplecall(1, 2, 3);
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}

/*
 * test8() - same statements as test7(), except that XBuf is cleared with
 * the inline bzerol() instead of bzero().
 */
static void
test8(struct DBuf *template)
{
    bzerol(XBuf, sizeof(XBuf));
    DBuf.x[0] = 0;
    DBuf.x[1] = GlobPtr->x[1];
    DBuf.x[2] = template->x[2];
    DBuf.x[3] = 0;
    DBuf.x[4] = 0;

    DBuf.y[0] = simplecall(1, 2, 3);
    DBuf.y[1] = template->y[1];
    DBuf.y[2] = template->y[2];
    DBuf.y[3] = 5;
    DBuf.y[4] = 0;
}


/*
 * simplecall() - return the sum of its three arguments.  Called from
 * test5() through test8().
 */
static int
simplecall(int a, int b, int c)
{
    int total = a;

    total += b;
    total += c;
    return(total);
}


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe cvs-all" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?200106232102.f5NL2fY73920>