Date:      Fri, 5 Jun 2009 19:22:47 +0000 (UTC)
From:      Luigi Rizzo <luigi@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r193532 - in head/sys: conf modules/dummynet modules/ipfw modules/ipfw_nat netinet netinet/ipfw
Message-ID:  <200906051922.n55JMlsj035701@svn.freebsd.org>

Author: luigi
Date: Fri Jun  5 19:22:47 2009
New Revision: 193532
URL: http://svn.freebsd.org/changeset/base/193532

Log:
  move kernel ipfw-related sources to a separate directory,
  adjust conf/files and modules' Makefiles accordingly.
  
  No code or ABI changes, so this and most of the previous related
  changes can be easily MFC'ed.
  
  MFC after:	5 days

Added:
  head/sys/netinet/ipfw/
  head/sys/netinet/ipfw/ip_dummynet.c   (props changed)
     - copied unchanged from r193497, head/sys/netinet/ip_dummynet.c
  head/sys/netinet/ipfw/ip_fw2.c   (contents, props changed)
     - copied, changed from r193502, head/sys/netinet/ip_fw2.c
  head/sys/netinet/ipfw/ip_fw_nat.c   (props changed)
     - copied unchanged from r193492, head/sys/netinet/ip_fw_nat.c
  head/sys/netinet/ipfw/ip_fw_pfil.c   (props changed)
     - copied unchanged from r193502, head/sys/netinet/ip_fw_pfil.c
Deleted:
  head/sys/netinet/ip_dummynet.c
  head/sys/netinet/ip_fw2.c
  head/sys/netinet/ip_fw_nat.c
  head/sys/netinet/ip_fw_pfil.c
Modified:
  head/sys/conf/files
  head/sys/modules/dummynet/Makefile
  head/sys/modules/ipfw/Makefile
  head/sys/modules/ipfw_nat/Makefile

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Fri Jun  5 18:50:45 2009	(r193531)
+++ head/sys/conf/files	Fri Jun  5 19:22:47 2009	(r193532)
@@ -2334,14 +2334,14 @@ netinet/in_proto.c		optional inet \
 	compile-with "${NORMAL_C} -I$S/contrib/pf"
 netinet/in_rmx.c		optional inet
 netinet/ip_divert.c		optional ipdivert
-netinet/ip_dummynet.c		optional dummynet
+netinet/ipfw/ip_dummynet.c	optional dummynet
 netinet/ip_ecn.c		optional inet | inet6
 netinet/ip_encap.c		optional inet | inet6
 netinet/ip_fastfwd.c		optional inet
-netinet/ip_fw2.c		optional ipfirewall \
+netinet/ipfw/ip_fw2.c		optional ipfirewall \
 	compile-with "${NORMAL_C} -I$S/contrib/pf"
-netinet/ip_fw_pfil.c		optional ipfirewall
-netinet/ip_fw_nat.c		optional ipfirewall_nat
+netinet/ipfw/ip_fw_pfil.c	optional ipfirewall
+netinet/ipfw/ip_fw_nat.c	optional ipfirewall_nat
 netinet/ip_icmp.c		optional inet
 netinet/ip_input.c		optional inet
 netinet/ip_ipsec.c		optional ipsec

Modified: head/sys/modules/dummynet/Makefile
==============================================================================
--- head/sys/modules/dummynet/Makefile	Fri Jun  5 18:50:45 2009	(r193531)
+++ head/sys/modules/dummynet/Makefile	Fri Jun  5 19:22:47 2009	(r193532)
@@ -2,7 +2,7 @@
 
 .include <bsd.own.mk>
 
-.PATH:  ${.CURDIR}/../../netinet
+.PATH:  ${.CURDIR}/../../netinet/ipfw
 KMOD=   dummynet
 SRCS=   ip_dummynet.c
 SRCS+=	opt_inet6.h

Modified: head/sys/modules/ipfw/Makefile
==============================================================================
--- head/sys/modules/ipfw/Makefile	Fri Jun  5 18:50:45 2009	(r193531)
+++ head/sys/modules/ipfw/Makefile	Fri Jun  5 19:22:47 2009	(r193532)
@@ -2,7 +2,7 @@
 
 .include <bsd.own.mk>
 
-.PATH: ${.CURDIR}/../../netinet
+.PATH: ${.CURDIR}/../../netinet/ipfw
 
 KMOD=	ipfw
 SRCS=	ip_fw2.c ip_fw_pfil.c

Modified: head/sys/modules/ipfw_nat/Makefile
==============================================================================
--- head/sys/modules/ipfw_nat/Makefile	Fri Jun  5 18:50:45 2009	(r193531)
+++ head/sys/modules/ipfw_nat/Makefile	Fri Jun  5 19:22:47 2009	(r193532)
@@ -1,6 +1,6 @@
 # $FreeBSD$
 
-.PATH: ${.CURDIR}/../../netinet
+.PATH: ${.CURDIR}/../../netinet/ipfw
 
 KMOD=   ipfw_nat
 SRCS=   ip_fw_nat.c

Copied: head/sys/netinet/ipfw/ip_dummynet.c (from r193497, head/sys/netinet/ip_dummynet.c)
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/netinet/ipfw/ip_dummynet.c	Fri Jun  5 19:22:47 2009	(r193532, copy of r193497, head/sys/netinet/ip_dummynet.c)
@@ -0,0 +1,2371 @@
+/*-
+ * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
+ * Portions Copyright (c) 2000 Akamba Corp.
+ * All rights reserved
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#define	DUMMYNET_DEBUG
+
+#include "opt_inet6.h"
+
+/*
+ * This module implements IP dummynet, a bandwidth limiter/delay emulator
+ * used in conjunction with the ipfw package.
+ * Description of the data structures used is in ip_dummynet.h
+ * Here you mainly find the following blocks of code:
+ *  + variable declarations;
+ *  + heap management functions;
+ *  + scheduler and dummynet functions;
+ *  + configuration and initialization.
+ *
+ * NOTA BENE: critical sections are protected by the "dummynet lock".
+ *
+ * Most important Changes:
+ *
+ * 011004: KLDable
+ * 010124: Fixed WF2Q behaviour
+ * 010122: Fixed spl protection.
+ * 000601: WF2Q support
+ * 000106: large rewrite, use heaps to handle very many pipes.
+ * 980513:	initial release
+ *
+ * include files marked with XXX are probably not needed
+ */
+
+#include <sys/limits.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <net/if.h>	/* IFNAMSIZ, struct ifaddr, ifq head, lock.h mutex.h */
+#include <net/netisr.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>		/* ip_len, ip_off */
+#include <netinet/ip_fw.h>
+#include <netinet/ip_dummynet.h>
+#include <netinet/ip_var.h>	/* ip_output(), IP_FORWARDING */
+
+#include <netinet/if_ether.h> /* various ether_* routines */
+
+#include <netinet/ip6.h>       /* for ip6_input, ip6_output prototypes */
+#include <netinet6/ip6_var.h>
+
+/*
+ * We keep a private variable for the simulation time, but we could
+ * probably use an existing one ("softticks" in sys/kern/kern_timeout.c)
+ */
+static dn_key curr_time = 0 ; /* current simulation time */
+
+static int dn_hash_size = 64 ;	/* default hash size */
+
+/* statistics on number of queue searches and search steps */
+static long searches, search_steps ;
+static int pipe_expire = 1 ;   /* expire queue if empty */
+static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
+
+static long pipe_slot_limit = 100; /* Foot shooting limit for pipe queues. */
+static long pipe_byte_limit = 1024 * 1024;
+
+static int red_lookup_depth = 256;	/* RED - default lookup table depth */
+static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
+static int red_max_pkt_size = 1500;     /* RED - default max packet size */
+
+static struct timeval prev_t, t;
+static long tick_last;			/* Last tick duration (usec). */
+static long tick_delta;			/* Last vs standard tick diff (usec). */
+static long tick_delta_sum;		/* Accumulated tick difference (usec).*/
+static long tick_adjustment;		/* Tick adjustments done. */
+static long tick_lost;			/* Lost(coalesced) ticks number. */
+/* Adjusted vs non-adjusted curr_time difference (ticks). */
+static long tick_diff;
+
+static int		io_fast;
+static unsigned long	io_pkt;
+static unsigned long	io_pkt_fast;
+static unsigned long	io_pkt_drop;
+
+/*
+ * Three heaps contain queues and pipes that the scheduler handles:
+ *
+ * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
+ *
+ * wfq_ready_heap contains the pipes associated with WF2Q flows
+ *
+ * extract_heap contains pipes associated with delay lines.
+ *
+ */
+
+MALLOC_DEFINE(M_DUMMYNET, "dummynet", "dummynet heap");
+
+static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
+
+static int	heap_init(struct dn_heap *h, int size);
+static int	heap_insert (struct dn_heap *h, dn_key key1, void *p);
+static void	heap_extract(struct dn_heap *h, void *obj);
+static void	transmit_event(struct dn_pipe *pipe, struct mbuf **head,
+		    struct mbuf **tail);
+static void	ready_event(struct dn_flow_queue *q, struct mbuf **head,
+		    struct mbuf **tail);
+static void	ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
+		    struct mbuf **tail);
+
+#define	HASHSIZE	16
+#define	HASH(num)	((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
+static struct dn_pipe_head	pipehash[HASHSIZE];	/* all pipes */
+static struct dn_flow_set_head	flowsethash[HASHSIZE];	/* all flowsets */
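+/*
+ * Worked example (illustrative): pipe number 66 (0x42) hashes to
+ * HASH(0x42) = ((0x00 ^ 0x04 ^ 0x42) & 0x0f) = 6, so that pipe is kept
+ * on the SLIST at pipehash[6].
+ */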
+
+static struct callout dn_timeout;
+
+extern	void (*bridge_dn_p)(struct mbuf *, struct ifnet *);
+
+#ifdef SYSCTL_NODE
+SYSCTL_DECL(_net_inet);
+SYSCTL_DECL(_net_inet_ip);
+
+SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet, CTLFLAG_RW, 0, "Dummynet");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
+    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
+#if 0	/* curr_time is 64 bit */
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, curr_time,
+    CTLFLAG_RD, &curr_time, 0, "Current tick");
+#endif
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
+    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
+    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, searches,
+    CTLFLAG_RD, &searches, 0, "Number of queue searches");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, search_steps,
+    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
+    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
+    CTLFLAG_RW, &dn_max_ratio, 0,
+    "Max ratio between dynamic queues and buckets");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
+    CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
+    CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
+    CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta,
+    CTLFLAG_RD, &tick_delta, 0, "Last vs standard tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_delta_sum,
+    CTLFLAG_RD, &tick_delta_sum, 0, "Accumulated tick difference (usec).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_adjustment,
+    CTLFLAG_RD, &tick_adjustment, 0, "Tick adjustments done.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_diff,
+    CTLFLAG_RD, &tick_diff, 0,
+    "Adjusted vs non-adjusted curr_time difference (ticks).");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, tick_lost,
+    CTLFLAG_RD, &tick_lost, 0,
+    "Number of ticks coalesced by dummynet taskqueue.");
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, io_fast,
+    CTLFLAG_RW, &io_fast, 0, "Enable fast dummynet io.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt,
+    CTLFLAG_RD, &io_pkt, 0,
+    "Number of packets passed to dummynet.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_fast,
+    CTLFLAG_RD, &io_pkt_fast, 0,
+    "Number of packets bypassed dummynet scheduler.");
+SYSCTL_ULONG(_net_inet_ip_dummynet, OID_AUTO, io_pkt_drop,
+    CTLFLAG_RD, &io_pkt_drop, 0,
+    "Number of packets dropped by dummynet.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_slot_limit,
+    CTLFLAG_RW, &pipe_slot_limit, 0, "Upper limit in slots for pipe queue.");
+SYSCTL_LONG(_net_inet_ip_dummynet, OID_AUTO, pipe_byte_limit,
+    CTLFLAG_RW, &pipe_byte_limit, 0, "Upper limit in bytes for pipe queue.");
+#endif
+
+#ifdef DUMMYNET_DEBUG
+int	dummynet_debug = 0;
+#ifdef SYSCTL_NODE
+SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
+	    0, "control debugging printfs");
+#endif
+#define	DPRINTF(X)	if (dummynet_debug) printf X
+#else
+#define	DPRINTF(X)
+#endif
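+/*
+ * Example use (illustrative): the macro expands to "printf X", so calls
+ * need an extra set of parentheses, e.g.
+ *	DPRINTF(("dummynet: pipe %d ready\n", p->pipe_nr));
+ * which only prints when the net.inet.ip.dummynet.debug sysctl is non-zero.
+ */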
+
+static struct task	dn_task;
+static struct taskqueue	*dn_tq = NULL;
+static void dummynet_task(void *, int);
+
+static struct mtx dummynet_mtx;
+#define	DUMMYNET_LOCK_INIT() \
+	mtx_init(&dummynet_mtx, "dummynet", NULL, MTX_DEF)
+#define	DUMMYNET_LOCK_DESTROY()	mtx_destroy(&dummynet_mtx)
+#define	DUMMYNET_LOCK()		mtx_lock(&dummynet_mtx)
+#define	DUMMYNET_UNLOCK()	mtx_unlock(&dummynet_mtx)
+#define	DUMMYNET_LOCK_ASSERT()	mtx_assert(&dummynet_mtx, MA_OWNED)
+
+static int	config_pipe(struct dn_pipe *p);
+static int	ip_dn_ctl(struct sockopt *sopt);
+
+static void	dummynet(void *);
+static void	dummynet_flush(void);
+static void	dummynet_send(struct mbuf *);
+void		dummynet_drain(void);
+static int	dummynet_io(struct mbuf **, int , struct ip_fw_args *);
+static void	dn_rule_delete(void *);
+
+/*
+ * Heap management functions.
+ *
+ * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
+ * Some macros help finding parent/children so we can optimize them.
+ *
+ * heap_init() is called to expand the heap when needed.
+ * Increment size in blocks of 16 entries.
+ * XXX failure to allocate a new element is a pretty bad failure
+ * as we basically stall a whole queue forever!!
+ * Returns 1 on error, 0 on success
+ */
+#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
+#define HEAP_LEFT(x) ( 2*(x) + 1 )
+#define HEAP_IS_LEFT(x) ( (x) & 1 )
+#define HEAP_RIGHT(x) ( 2*(x) + 2 )
+#define	HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
+#define HEAP_INCREMENT	15
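+/*
+ * Worked example (illustrative): with HEAP_INCREMENT == 15, a request to
+ * grow to 20 entries is rounded up to (20 + 15) & ~15 == 32 slots.  Node 5
+ * has parent HEAP_FATHER(5) == 2 and children HEAP_LEFT(5) == 11 and
+ * HEAP_RIGHT(5) == 12; HEAP_IS_LEFT(5) is true since left children have
+ * odd indices.
+ */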
+
+static int
+heap_init(struct dn_heap *h, int new_size)
+{
+    struct dn_heap_entry *p;
+
+    if (h->size >= new_size ) {
+	printf("dummynet: %s, Bogus call, have %d want %d\n", __func__,
+		h->size, new_size);
+	return 0 ;
+    }
+    new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
+    p = malloc(new_size * sizeof(*p), M_DUMMYNET, M_NOWAIT);
+    if (p == NULL) {
+	printf("dummynet: %s, resize %d failed\n", __func__, new_size );
+	return 1 ; /* error */
+    }
+    if (h->size > 0) {
+	bcopy(h->p, p, h->size * sizeof(*p) );
+	free(h->p, M_DUMMYNET);
+    }
+    h->p = p ;
+    h->size = new_size ;
+    return 0 ;
+}
+
+/*
+ * Insert element in heap. Normally, p != NULL, we insert p in
+ * a new position and bubble up. If p == NULL, then the element is
+ * already in place, and key is the position where to start the
+ * bubble-up.
+ * Returns 1 on failure (cannot allocate new heap entry)
+ *
+ * If offset > 0 the position (index, int) of the element in the heap is
+ * also stored in the element itself at the given offset in bytes.
+ */
+#define SET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
+/*
+ * RESET_OFFSET is used for sanity checks. It sets the stored index to an invalid value.
+ */
+#define RESET_OFFSET(heap, node) \
+    if (heap->offset > 0) \
+	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
+static int
+heap_insert(struct dn_heap *h, dn_key key1, void *p)
+{
+    int son = h->elements ;
+
+    if (p == NULL)	/* data already there, set starting point */
+	son = key1 ;
+    else {		/* insert new element at the end, possibly resize */
+	son = h->elements ;
+	if (son == h->size) /* need resize... */
+	    if (heap_init(h, h->elements+1) )
+		return 1 ; /* failure... */
+	h->p[son].object = p ;
+	h->p[son].key = key1 ;
+	h->elements++ ;
+    }
+    while (son > 0) {				/* bubble up */
+	int father = HEAP_FATHER(son) ;
+	struct dn_heap_entry tmp  ;
+
+	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
+	    break ; /* found right position */
+	/* son smaller than father, swap and repeat */
+	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
+	SET_OFFSET(h, son);
+	son = father ;
+    }
+    SET_OFFSET(h, son);
+    return 0 ;
+}
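+/*
+ * Illustrative note: a heap that needs extraction from the middle stores
+ * each object's index inside the object itself, e.g. (field name assumed
+ * here for illustration)
+ *	h->offset = offsetof(struct dn_flow_queue, heap_pos);
+ * so that SET_OFFSET() can record the position and heap_extract() below
+ * can locate the entry without a linear search.
+ */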
+
+/*
+ * remove top element from heap, or obj if obj != NULL
+ */
+static void
+heap_extract(struct dn_heap *h, void *obj)
+{
+    int child, father, max = h->elements - 1 ;
+
+    if (max < 0) {
+	printf("dummynet: warning, extract from empty heap 0x%p\n", h);
+	return ;
+    }
+    father = 0 ; /* default: move up smallest child */
+    if (obj != NULL) { /* extract specific element, index is at offset */
+	if (h->offset <= 0)
+	    panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
+	father = *((int *)((char *)obj + h->offset)) ;
+	if (father < 0 || father >= h->elements) {
+	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
+		father, h->elements);
+	    panic("dummynet: heap_extract");
+	}
+    }
+    RESET_OFFSET(h, father);
+    child = HEAP_LEFT(father) ;		/* left child */
+    while (child <= max) {		/* valid entry */
+	if (child != max && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
+	    child = child+1 ;		/* take right child, otherwise left */
+	h->p[father] = h->p[child] ;
+	SET_OFFSET(h, father);
+	father = child ;
+	child = HEAP_LEFT(child) ;   /* left child for next loop */
+    }
+    h->elements-- ;
+    if (father != max) {
+	/*
+	 * Fill hole with last entry and bubble up, reusing the insert code
+	 */
+	h->p[father] = h->p[max] ;
+	heap_insert(h, father, NULL); /* this one cannot fail */
+    }
+}
+
+#if 0
+/*
+ * change object position and update references
+ * XXX this one is never used!
+ */
+static void
+heap_move(struct dn_heap *h, dn_key new_key, void *object)
+{
+    int temp;
+    int i ;
+    int max = h->elements-1 ;
+    struct dn_heap_entry buf ;
+
+    if (h->offset <= 0)
+	panic("cannot move items on this heap");
+
+    i = *((int *)((char *)object + h->offset));
+    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
+	h->p[i].key = new_key ;
+	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
+		 i = temp ) { /* bubble up */
+	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+	    SET_OFFSET(h, i);
+	}
+    } else {		/* must move down */
+	h->p[i].key = new_key ;
+	while ( (temp = HEAP_LEFT(i)) <= max ) { /* found left child */
+	    if ((temp != max) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
+		temp++ ; /* select child with min key */
+	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
+		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
+		SET_OFFSET(h, i);
+	    } else
+		break ;
+	    i = temp ;
+	}
+    }
+    SET_OFFSET(h, i);
+}
+#endif /* heap_move, unused */
+
+/*
+ * heapify() will reorganize data inside an array to maintain the
+ * heap property. It is needed when we delete a bunch of entries.
+ */
+static void
+heapify(struct dn_heap *h)
+{
+    int i ;
+
+    for (i = 0 ; i < h->elements ; i++ )
+	heap_insert(h, i , NULL) ;
+}
+
+/*
+ * cleanup the heap and free data structure
+ */
+static void
+heap_free(struct dn_heap *h)
+{
+    if (h->size >0 )
+	free(h->p, M_DUMMYNET);
+    bzero(h, sizeof(*h) );
+}
+
+/*
+ * --- end of heap management functions ---
+ */
+
+/*
+ * Return the mbuf tag holding the dummynet state.  As an optimization
+ * this is assumed to be the first tag on the list.  If this turns out
+ * wrong we'll need to search the list.
+ */
+static struct dn_pkt_tag *
+dn_tag_get(struct mbuf *m)
+{
+    struct m_tag *mtag = m_tag_first(m);
+    KASSERT(mtag != NULL &&
+	    mtag->m_tag_cookie == MTAG_ABI_COMPAT &&
+	    mtag->m_tag_id == PACKET_TAG_DUMMYNET,
+	    ("packet on dummynet queue w/o dummynet tag!"));
+    return (struct dn_pkt_tag *)(mtag+1);
+}
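+/*
+ * Illustrative sketch (the actual attach code is in dummynet_io(), past the
+ * truncated part of this diff): the tag is expected to be prepended so it
+ * ends up first on the chain, roughly
+ *	mtag = m_tag_get(PACKET_TAG_DUMMYNET,
+ *	    sizeof(struct dn_pkt_tag), M_NOWAIT | M_ZERO);
+ *	if (mtag != NULL)
+ *		m_tag_prepend(m, mtag);
+ * with the dn_pkt_tag payload living right after the m_tag header.
+ */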
+
+/*
+ * Scheduler functions:
+ *
+ * transmit_event() is called when the delay-line needs to enter
+ * the scheduler, either because of existing pkts getting ready,
+ * or new packets entering the queue. The event handled is the delivery
+ * time of the packet.
+ *
+ * ready_event() does something similar with fixed-rate queues, and the
+ * event handled is the finish time of the head pkt.
+ *
+ * ready_event_wfq() does something similar with WF2Q queues, and the
+ * event handled is the start time of the head pkt.
+ *
+ * In all cases, we make sure that the data structures are consistent
+ * before passing pkts out, because this might trigger recursive
+ * invocations of the procedures.
+ */
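+/*
+ * Illustrative walk-through: a packet queued on a fixed-rate pipe has its
+ * queue inserted into ready_heap keyed by the packet's finish time; when
+ * that key becomes <= curr_time, ready_event() moves the packet to the
+ * delay line with output_time = curr_time + p->delay, and transmit_event()
+ * finally releases it once output_time is reached.
+ */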
+static void
+transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
+{
+	struct mbuf *m;
+	struct dn_pkt_tag *pkt;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	while ((m = pipe->head) != NULL) {
+		pkt = dn_tag_get(m);
+		if (!DN_KEY_LEQ(pkt->output_time, curr_time))
+			break;
+
+		pipe->head = m->m_nextpkt;
+		if (*tail != NULL)
+			(*tail)->m_nextpkt = m;
+		else
+			*head = m;
+		*tail = m;
+	}
+	if (*tail != NULL)
+		(*tail)->m_nextpkt = NULL;
+
+	/* If there are leftover packets, put into the heap for next event. */
+	if ((m = pipe->head) != NULL) {
+		pkt = dn_tag_get(m);
+		/*
+		 * XXX Should check errors on heap_insert, by draining the
+		 * whole pipe p and hoping in the future we are more successful.
+		 */
+		heap_insert(&extract_heap, pkt->output_time, pipe);
+	}
+}
+
+#define div64(a, b)	((int64_t)(a) / (int64_t)(b))
+#define DN_TO_DROP	0xffff
+/*
+ * Compute how many ticks we have to wait before being able to send
+ * a packet. This is computed as the "wire time" for the packet
+ * (length + extra bits), minus the credit available, scaled to ticks.
+ * Check that the result is not negative (it could be if we have
+ * too much leftover credit in q->numbytes).
+ */
+static inline dn_key
+set_ticks(struct mbuf *m, struct dn_flow_queue *q, struct dn_pipe *p)
+{
+	int64_t ret;
+
+	ret = div64( (m->m_pkthdr.len * 8 + q->extra_bits) * hz
+		- q->numbytes + p->bandwidth - 1 , p->bandwidth);
+#if 0
+	printf("%s %d extra_bits %d numb %d ret %d\n",
+		__FUNCTION__, __LINE__,
+		(int)(q->extra_bits & 0xffffffff),
+		(int)(q->numbytes & 0xffffffff),
+		(int)(ret & 0xffffffff));
+#endif
+	if (ret < 0)
+		ret = 0;
+	return ret;
+}
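+/*
+ * Worked example (illustrative): with hz = 1000, a 1500-byte packet on a
+ * 1 Mbit/s pipe with no leftover credit and no extra bits needs
+ * div64(1500*8*1000 + 1000000 - 1, 1000000) = 12 ticks, i.e. 12 ms of
+ * wire time.
+ */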
+
+/*
+ * Convert the additional MAC overheads/delays into an equivalent
+ * number of bits for the given data rate. The samples are in milliseconds
+ * so we need to divide by 1000.
+ */
+static dn_key
+compute_extra_bits(struct mbuf *pkt, struct dn_pipe *p)
+{
+	int index;
+	dn_key extra_bits;
+
+	if (!p->samples || p->samples_no == 0)
+		return 0;
+	index  = random() % p->samples_no;
+	extra_bits = ((dn_key)p->samples[index] * p->bandwidth) / 1000;
+	if (index >= p->loss_level) {
+		struct dn_pkt_tag *dt = dn_tag_get(pkt);
+		if (dt)
+			dt->dn_dir = DN_TO_DROP;
+	}
+	return extra_bits;
+}
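+/*
+ * Worked example (illustrative): a 2 ms sample drawn for a 1 Mbit/s pipe
+ * adds (2 * 1000000) / 1000 = 2000 extra bits of wire time; if the random
+ * index lands at or above p->loss_level, the packet is instead marked
+ * DN_TO_DROP.
+ */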
+
+static void
+free_pipe(struct dn_pipe *p)
+{
+	if (p->samples)
+		free(p->samples, M_DUMMYNET);
+	free(p, M_DUMMYNET);
+}
+
+/*
+ * extract pkt from queue, compute output time (could be now)
+ * and put into delay line (p_queue)
+ */
+static void
+move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p,
+    int len)
+{
+    struct dn_pkt_tag *dt = dn_tag_get(pkt);
+
+    q->head = pkt->m_nextpkt ;
+    q->len-- ;
+    q->len_bytes -= len ;
+
+    dt->output_time = curr_time + p->delay ;
+
+    if (p->head == NULL)
+	p->head = pkt;
+    else
+	p->tail->m_nextpkt = pkt;
+    p->tail = pkt;
+    p->tail->m_nextpkt = NULL;
+}
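+/*
+ * Illustrative note: assuming the usual ms-to-ticks conversion done at
+ * pipe configuration time, "delay 50" with hz = 1000 gives p->delay == 50,
+ * so a packet moved here at curr_time T becomes eligible for
+ * transmit_event() at T + 50, i.e. 50 ms later.
+ */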
+
+/*
+ * ready_event() is invoked every time the queue must enter the
+ * scheduler, either because the first packet arrives, or because
+ * a previously scheduled event fired.
+ * On invocation, drain as many pkts as possible (could be 0) and then
+ * if there are leftover packets reinsert the pkt in the scheduler.
+ */
+static void
+ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
+{
+	struct mbuf *pkt;
+	struct dn_pipe *p = q->fs->pipe;
+	int p_was_empty;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	if (p == NULL) {
+		printf("dummynet: ready_event- pipe is gone\n");
+		return;
+	}
+	p_was_empty = (p->head == NULL);
+
+	/*
+	 * Schedule fixed-rate queues linked to this pipe:
+	 * account for the bw accumulated since last scheduling, then
+	 * drain as many pkts as allowed by q->numbytes and move to
+	 * the delay line (in p) computing output time.
+	 * bandwidth==0 (no limit) means we can drain the whole queue,
+	 * setting len_scaled = 0 does the job.
+	 */
+	q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
+	while ((pkt = q->head) != NULL) {
+		int len = pkt->m_pkthdr.len;
+		dn_key len_scaled = p->bandwidth ? len*8*hz
+			+ q->extra_bits*hz
+			: 0;
+
+		if (DN_KEY_GT(len_scaled, q->numbytes))
+			break;
+		q->numbytes -= len_scaled;
+		move_pkt(pkt, q, p, len);
+		if (q->head)
+			q->extra_bits = compute_extra_bits(q->head, p);
+	}
+	/*
+	 * If we have more packets queued, schedule next ready event
+	 * (can only occur when bandwidth != 0, otherwise we would have
+	 * flushed the whole queue in the previous loop).
+	 * To this purpose we record the current time and compute how many
+	 * ticks to go for the finish time of the packet.
+	 */
+	if ((pkt = q->head) != NULL) {	/* this implies bandwidth != 0 */
+		dn_key t = set_ticks(pkt, q, p); /* ticks i have to wait */
+
+		q->sched_time = curr_time;
+		heap_insert(&ready_heap, curr_time + t, (void *)q);
+		/*
+		 * XXX Should check errors on heap_insert, and drain the whole
+		 * queue on error hoping next time we are luckier.
+		 */
+	} else		/* RED needs to know when the queue becomes empty. */
+		q->q_time = curr_time;
+
+	/*
+	 * If the delay line was empty call transmit_event() now.
+	 * Otherwise, the scheduler will take care of it.
+	 */
+	if (p_was_empty)
+		transmit_event(p, head, tail);
+}
+
+/*
+ * Called when we can transmit packets on WF2Q queues. Take pkts out of
+ * the queues at their start time, and enqueue into the delay line.
+ * Packets are drained until p->numbytes < 0. As long as
+ * len_scaled >= p->numbytes, the packet goes into the delay line
+ * with a deadline p->delay. For the last packet, if p->numbytes < 0,
+ * there is an additional delay.
+ */
+static void
+ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
+{
+	int p_was_empty = (p->head == NULL);
+	struct dn_heap *sch = &(p->scheduler_heap);
+	struct dn_heap *neh = &(p->not_eligible_heap);
+	int64_t p_numbytes = p->numbytes;
+
+	DUMMYNET_LOCK_ASSERT();
+
+	if (p->if_name[0] == 0)		/* tx clock is simulated */
+		/*
+		 * Since result may not fit into p->numbytes (32bit) we
+		 * are using 64bit var here.
+		 */
+		p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
+	else {	/*
+		 * tx clock is for real,
+		 * the ifq must be empty or this is a NOP.
+		 */
+		if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
+			return;
+		else {
+			DPRINTF(("dummynet: pipe %d ready from %s --\n",
+			    p->pipe_nr, p->if_name));
+		}
+	}
+
+	/*
+	 * While we have backlogged traffic AND credit, we need to do
+	 * something on the queue.
+	 */
+	while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
+		if (sch->elements > 0) {
+			/* Have some eligible pkts to send out. */
+			struct dn_flow_queue *q = sch->p[0].object;
+			struct mbuf *pkt = q->head;
+			struct dn_flow_set *fs = q->fs;
+			uint64_t len = pkt->m_pkthdr.len;
+			int len_scaled = p->bandwidth ? len * 8 * hz : 0;
+
+			heap_extract(sch, NULL); /* Remove queue from heap. */
+			p_numbytes -= len_scaled;
+			move_pkt(pkt, q, p, len);
+
+			p->V += (len << MY_M) / p->sum;	/* Update V. */
+			q->S = q->F;			/* Update start time. */
+			if (q->len == 0) {
+				/* Flow not backlogged any more. */
+				fs->backlogged--;
+				heap_insert(&(p->idle_heap), q->F, q);
+			} else {
+				/* Still backlogged. */
+
+				/*
+				 * Update F and position in backlogged queue,
+				 * then put flow in not_eligible_heap
+				 * (we will fix this later).
+				 */
+				len = (q->head)->m_pkthdr.len;
+				q->F += (len << MY_M) / (uint64_t)fs->weight;
+				if (DN_KEY_LEQ(q->S, p->V))
+					heap_insert(neh, q->S, q);
+				else
+					heap_insert(sch, q->F, q);
+			}
+		}
+		/*
+		 * Now compute V = max(V, min(S_i)). Remember that all elements
+		 * in sch have by definition S_i <= V so if sch is not empty,
+		 * V is surely the max and we must not update it. Conversely,
+		 * if sch is empty we only need to look at neh.
+		 */
+		if (sch->elements == 0 && neh->elements > 0)
+			p->V = MAX64(p->V, neh->p[0].key);
+		/* Move from neh to sch any flows that have become eligible. */
+		while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
+			struct dn_flow_queue *q = neh->p[0].object;
+			heap_extract(neh, NULL);
+			heap_insert(sch, q->F, q);
+		}
+
+		if (p->if_name[0] != '\0') { /* Tx clock is from a real thing */
+			p_numbytes = -1;	/* Mark not ready for I/O. */
+			break;
+		}
+	}
+	if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0 &&
+	    p->idle_heap.elements > 0) {
+		/*
+		 * No traffic and no events scheduled.
+		 * We can get rid of idle-heap.
+		 */
+		int i;
+
+		for (i = 0; i < p->idle_heap.elements; i++) {
+			struct dn_flow_queue *q = p->idle_heap.p[i].object;
+
+			q->F = 0;
+			q->S = q->F + 1;
+		}
+		p->sum = 0;
+		p->V = 0;
+		p->idle_heap.elements = 0;
+	}
+	/*
+	 * If we are getting clocks from dummynet (not a real interface) and
+	 * if we are under credit, schedule the next ready event.
+	 * Also fix the delivery time of the last packet.
+	 */
+	if (p->if_name[0]==0 && p_numbytes < 0) { /* This implies bw > 0. */
+		dn_key t = 0;		/* Number of ticks i have to wait. */
+
+		if (p->bandwidth > 0)
+			t = (p->bandwidth - 1 - p_numbytes) / p->bandwidth;
+		dn_tag_get(p->tail)->output_time += t;
+		p->sched_time = curr_time;
+		heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
+		/*
+		 * XXX Should check errors on heap_insert, and drain the whole
+		 * queue on error hoping next time we are luckier.
+		 */
+	}
+
+	/* Fit (adjust if necessary) 64bit result into 32bit variable. */
+	if (p_numbytes > INT_MAX)
+		p->numbytes = INT_MAX;
+	else if (p_numbytes < INT_MIN)
+		p->numbytes = INT_MIN;
+	else
+		p->numbytes = p_numbytes;
+
+	/*
+	 * If the delay line was empty call transmit_event() now.
+	 * Otherwise, the scheduler will take care of it.
+	 */
+	if (p_was_empty)
+		transmit_event(p, head, tail);
+}
+
+/*
+ * This is called one tick after the previous run. It is used to
+ * schedule next run.
+ */
+static void
+dummynet(void * __unused unused)
+{
+
+	taskqueue_enqueue(dn_tq, &dn_task);
+}
+
+/*
+ * The main dummynet processing function.
+ */
+static void
+dummynet_task(void *context, int pending)
+{
+	struct mbuf *head = NULL, *tail = NULL;
+	struct dn_pipe *pipe;
+	struct dn_heap *heaps[3];
+	struct dn_heap *h;
+	void *p;	/* generic parameter to handler */
+	int i;
+
+	DUMMYNET_LOCK();
+
+	heaps[0] = &ready_heap;			/* fixed-rate queues */
+	heaps[1] = &wfq_ready_heap;		/* wfq queues */
+	heaps[2] = &extract_heap;		/* delay line */
+
+ 	/* Update number of lost(coalesced) ticks. */
+ 	tick_lost += pending - 1;
+ 
+ 	getmicrouptime(&t);
+ 	/* Last tick duration (usec). */
+ 	tick_last = (t.tv_sec - prev_t.tv_sec) * 1000000 +
+ 	    (t.tv_usec - prev_t.tv_usec);
+ 	/* Last tick vs standard tick difference (usec). */
+ 	tick_delta = (tick_last * hz - 1000000) / hz;
+ 	/* Accumulated tick difference (usec). */
+ 	tick_delta_sum += tick_delta;
+ 
+ 	prev_t = t;
+ 
+ 	/*
+ 	 * Adjust curr_time if accumulated tick difference greater than
+ 	 * 'standard' tick. Since curr_time should be monotonically increasing,
+ 	 * we do positive adjustment as required and throttle curr_time in
+ 	 * case of negative adjustment.
+ 	 */
+  	curr_time++;
+ 	if (tick_delta_sum - tick >= 0) {
+ 		int diff = tick_delta_sum / tick;
+ 
+ 		curr_time += diff;
+ 		tick_diff += diff;
+ 		tick_delta_sum %= tick;
+ 		tick_adjustment++;
+ 	} else if (tick_delta_sum + tick <= 0) {
+ 		curr_time--;
+ 		tick_diff--;
+ 		tick_delta_sum += tick;
+ 		tick_adjustment++;
+ 	}
+
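+	/*
+	 * Worked example for the adjustment above (illustrative): with
+	 * hz = 1000 the standard tick is 1000 usec; once drift accumulates
+	 * so that tick_delta_sum reaches 1500 usec, curr_time advances by
+	 * one extra tick and the remaining 500 usec carry over to the next
+	 * run.
+	 */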
+	for (i = 0; i < 3; i++) {
+		h = heaps[i];
+		while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
+			if (h->p[0].key > curr_time)
+				printf("dummynet: warning, "
+				    "heap %d is %d ticks late\n",
+				    i, (int)(curr_time - h->p[0].key));
+			/* store a copy before heap_extract */
+			p = h->p[0].object;
+			/* need to extract before processing */
+			heap_extract(h, NULL);
+			if (i == 0)
+				ready_event(p, &head, &tail);
+			else if (i == 1) {
+				struct dn_pipe *pipe = p;
+				if (pipe->if_name[0] != '\0')
+					printf("dummynet: bad ready_event_wfq "
+					    "for pipe %s\n", pipe->if_name);
+				else
+					ready_event_wfq(p, &head, &tail);
+			} else
+				transmit_event(p, &head, &tail);
+		}
+	}
+
+	/* Sweep pipes trying to expire idle flow_queues. */
+	for (i = 0; i < HASHSIZE; i++)
+		SLIST_FOREACH(pipe, &pipehash[i], next)
+			if (pipe->idle_heap.elements > 0 &&
+			    DN_KEY_LT(pipe->idle_heap.p[0].key, pipe->V)) {
+				struct dn_flow_queue *q =
+				    pipe->idle_heap.p[0].object;
+
+				heap_extract(&(pipe->idle_heap), NULL);
+				/* Mark timestamp as invalid. */
+				q->S = q->F + 1;
+				pipe->sum -= q->fs->weight;
+			}
+
+	DUMMYNET_UNLOCK();
+
+	if (head != NULL)
+		dummynet_send(head);
+
+	callout_reset(&dn_timeout, 1, dummynet, NULL);
+}
+
+static void
+dummynet_send(struct mbuf *m)
+{
+	struct dn_pkt_tag *pkt;
+	struct mbuf *n;
+	struct ip *ip;
+
+	for (; m != NULL; m = n) {
+		n = m->m_nextpkt;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


