Date:      Mon, 2 Apr 2012 16:13:55 +0000 (UTC)
From:      Gleb Smirnoff <glebius@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-projects@freebsd.org
Subject:   svn commit: r233782 - projects/pf/head/sys/contrib/pf/net
Message-ID:  <201204021613.q32GDtGw005249@svn.freebsd.org>

Author: glebius
Date: Mon Apr  2 16:13:54 2012
New Revision: 233782
URL: http://svn.freebsd.org/changeset/base/233782

Log:
  A major step in making pf more SMP friendly. The changes depend on
  each other, so this commit could not be split effectively.
  
  - The RB-tree for states has been dropped; a hash table is used
    instead. Locking is per hash slot. The hash function is cheap,
    simply (id % hashsize), so the slot mutex also serves to lock all
    states inside the slot (a sketch of the slot layout follows this
    list).
  - The RB-tree for state keys has been dropped; a hash table is used
    instead. Locking is per hash slot.
    The lock order is "key hash lock" first, then "id hash lock".
  - The global list of states has been dropped. The ID hash slots are
    traversed instead.
  - State reference counting is introduced. The ID hash, pfsync and
    the keys each hold a reference on a state (a sketch follows the
    list).
  - pf_find_state() and pf_find_state_byid() return the state locked
    on success.
  - pf_unlink_state() actually frees the state if we held the last
    reference.
  - pf_purge_expired_states() processes a fraction of the ID hash each
    second (sketched after the list). The black magic with the sx(9)
    lock has been removed.
  - pfsync_state_in_use() has been axed; generic reference counting is
    used instead.
  - The key-to-key pointer sk->reverse is temporarily disabled, since
    enabling it introduces LORs that are difficult to deal with.
    Disabling it also removes the m_addr_changed() hack from the
    TCP/IP stack. Re-introducing this optimization should be
    reconsidered later.
  - Start on better locking in pfioctl(): PF_LOCK() is acquired
    separately for each command, so that commands can be unlocked one
    by one, and work has started on using M_WAITOK allocations and on
    not holding the lock across copyout(). For now the only polished
    command is DIOCGETSTATES (sketched after the list).
  - Some pf(4) initialization routines have been moved from pf_ioctl.c
    to pf.c and made static. I believe only the ioctl handling belongs
    in pf_ioctl.c.
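  
  A minimal sketch of the per-slot hash layout described in the first
  two items. The struct and macro names follow the diff below, but
  pfvar.h is not part of this (truncated) diff, so the exact
  definitions here are assumptions.
  
      /* One mutex per slot covers every state or key hashed into it. */
      struct pf_idhash {
              LIST_HEAD(, pf_state)           states;
              struct mtx                      lock;
      };
  
      struct pf_keyhash {
              LIST_HEAD(, pf_state_key)       keys;
              struct mtx                      lock;
      };
  
      /*
       * The ID hash is simply (id % hashsize); with PF_HASHSIZ a power
       * of two this is a mask with V_pf_hashmask.  The real macro may
       * byte-swap the 64-bit id first.
       */
      #define PF_IDHASH(s)            ((s)->id & V_pf_hashmask)
      #define PF_HASHROW_LOCK(h)      mtx_lock(&(h)->lock)
      #define PF_HASHROW_UNLOCK(h)    mtx_unlock(&(h)->lock)
      #define PF_HASHROW_ASSERT(h)    mtx_assert(&(h)->lock, MA_OWNED)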
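  
  A minimal sketch of the reference and lock discipline, assuming a
  caller that already knows the state id and creatorid. The helper
  names appear in the diff below; the surrounding fragment is
  illustrative only.
  
      struct pf_state_cmp cmp;
      struct pf_state *s;
  
      bzero(&cmp, sizeof(cmp));
      cmp.id = id;                    /* id/creatorid supplied by caller */
      cmp.creatorid = creatorid;
  
      s = pf_find_state_byid(&cmp);   /* returns the state locked */
      if (s != NULL) {
              pf_ref_state(s);        /* take a reference before unlocking */
              PF_STATE_UNLOCK(s);
              /* ... work on the state without the ID-hash row lock ... */
              pf_release_state(s);    /* freed once unlinked and last
                                         reference dropped */
      }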
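  
  The purge change can be pictured roughly as below: each run locks a
  fraction of the ID-hash rows, expires what it finds, and remembers
  where to resume. The prototype matches the declaration in the diff;
  the resume index, the parameter name and the body are assumptions,
  and only the row relock pattern is taken from pfsync_in_clr() below.
  
      static u_int pf_purge_idx;      /* hypothetical resume point
                                         (per-vnet in the real code) */
  
      static void
      pf_purge_expired_states(int fraction)
      {
              u_int i, end;
  
              end = pf_purge_idx + (V_pf_hashmask + 1) / fraction;
              for (i = pf_purge_idx; i <= V_pf_hashmask && i < end; i++) {
                      struct pf_idhash *ih = &V_pf_idhash[i];
                      struct pf_state *s;
      relock:
                      PF_HASHROW_LOCK(ih);
                      LIST_FOREACH(s, &ih->states, entry)
                              if (pf_state_expires(s) <= time_second) {
                                      /* Unlinking drops the row lock. */
                                      pf_unlink_state(s, PF_ENTER_LOCKED);
                                      goto relock;
                              }
                      PF_HASHROW_UNLOCK(ih);
              }
              pf_purge_idx = (i > V_pf_hashmask) ? 0 : i;
      }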
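  
  And a rough picture of the DIOCGETSTATES pattern: the export buffer
  is allocated with M_WAITOK while no pf lock is held, rows are locked
  one at a time while states are exported, and copyout() runs with no
  lock at all. Bounds checks and the real buffer sizing are omitted;
  this is not the committed code.
  
      case DIOCGETSTATES: {
              struct pfioc_states *ps = (struct pfioc_states *)addr;
              struct pfsync_state *pstore;
              u_int i, nr = 0;
  
              /* Sleepable allocation: no pf lock is held here. */
              pstore = malloc(ps->ps_len, M_TEMP, M_WAITOK | M_ZERO);
  
              for (i = 0; i <= V_pf_hashmask; i++) {
                      struct pf_idhash *ih = &V_pf_idhash[i];
                      struct pf_state *s;
  
                      PF_HASHROW_LOCK(ih);
                      LIST_FOREACH(s, &ih->states, entry)
                              pfsync_state_export(&pstore[nr++], s);
                      PF_HASHROW_UNLOCK(ih);
              }
  
              /* No lock interlocks with the (sleepable) copyout(). */
              error = copyout(pstore, ps->ps_states, nr * sizeof(*pstore));
              free(pstore, M_TEMP);
              break;
      }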
  
  There are probably also lots of smaller changes not directly related
  to the above. I'm sorry.
  
  This code hasn't been properly tested. I only feed my test box with
  some live data via pfsync(4): its switch port is in monitor mode,
  monitoring the live pfsync VLAN. The test box isn't forwarding
  anything except its own traffic.
  
  The entire pf is still running under the "pf Giant lock", since many
  other things need to be locked before it can be removed.

Modified:
  projects/pf/head/sys/contrib/pf/net/if_pfsync.c
  projects/pf/head/sys/contrib/pf/net/pf.c
  projects/pf/head/sys/contrib/pf/net/pf_ioctl.c
  projects/pf/head/sys/contrib/pf/net/pf_mtag.h
  projects/pf/head/sys/contrib/pf/net/pfvar.h

Modified: projects/pf/head/sys/contrib/pf/net/if_pfsync.c
==============================================================================
--- projects/pf/head/sys/contrib/pf/net/if_pfsync.c	Mon Apr  2 15:07:22 2012	(r233781)
+++ projects/pf/head/sys/contrib/pf/net/if_pfsync.c	Mon Apr  2 16:13:54 2012	(r233782)
@@ -230,8 +230,8 @@ struct pfsync_softc {
 	struct callout		 sc_bulkfail_tmo;
 
 	u_int32_t		 sc_ureq_received;
-	struct pf_state		*sc_bulk_next;
-	struct pf_state		*sc_bulk_last;
+	int			 sc_bulk_hash_id;
+	struct pf_state_cmp	 sc_bulk_state;
 	struct callout		 sc_bulk_tmo;
 
 	TAILQ_HEAD(, tdb)	 sc_tdb_q;
@@ -256,8 +256,6 @@ static int	pfsync_init(void);
 static void	pfsync_uninit(void);
 static void	pfsync_sendout1(int);
 
-#define	schednetisr(NETISR_PFSYNC)	swi_sched(V_pfsync_swi_cookie, 0)
-
 SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
 SYSCTL_VNET_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_RW,
     &VNET_NAME(pfsyncstats), pfsyncstats,
@@ -558,14 +556,15 @@ pfsync_state_import(struct pfsync_state 
 		goto cleanup_state;
 	}
 
-	if (!ISSET(flags, PFSYNC_SI_IOCTL)) {
-		CLR(st->state_flags, PFSTATE_NOSYNC);
-		if (ISSET(st->state_flags, PFSTATE_ACK)) {
+	if (!(flags & PFSYNC_SI_IOCTL)) {
+		st->state_flags &= ~PFSTATE_NOSYNC;
+		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_q_ins(st, PFSYNC_S_IACK);
-			schednetisr(NETISR_PFSYNC);
+			swi_sched(V_pfsync_swi_cookie, 0);
 		}
 	}
-	CLR(st->state_flags, PFSTATE_ACK);
+	st->state_flags &= ~PFSTATE_ACK;
+	PF_STATE_UNLOCK(st);
 
 	return (0);
 
@@ -578,7 +577,7 @@ cleanup:
 	if (sks != NULL)
 		uma_zfree(V_pf_state_key_z, sks);
 
-cleanup_state:	/* pf_state_insert frees the state keys */
+cleanup_state:	/* pf_state_insert() frees the state keys. */
 	if (st) {
 		if (st->dst.scrub)
 			uma_zfree(V_pf_state_scrub_z, st->dst.scrub);
@@ -679,8 +678,6 @@ pfsync_in_clr(struct pfsync_pkt *pkt, st
 	struct mbuf *mp;
 	int len = sizeof(*clr) * count;
 	int i, offp;
-
-	struct pf_state *st, *nexts;
 	u_int32_t creatorid;
 
 	mp = m_pulldown(m, offset, len, &offp);
@@ -698,18 +695,20 @@ pfsync_in_clr(struct pfsync_pkt *pkt, st
 		    pfi_kif_get(clr[i].ifname) == NULL)
 			continue;
 
-		PF_KEYS_LOCK();
-		PF_IDS_LOCK();
-		for (st = RB_MIN(pf_state_tree_id, &V_tree_id);
-		    st; st = nexts) {
-			nexts = RB_NEXT(pf_state_tree_id, &V_tree_id, st);
-			if (st->creatorid == creatorid) {
-				SET(st->state_flags, PFSTATE_NOSYNC);
-				pf_unlink_state(st, 1);
+		for (int i = 0; i <= V_pf_hashmask; i++) {
+			struct pf_idhash *ih = &V_pf_idhash[i];
+			struct pf_state *s;
+relock:
+			PF_HASHROW_LOCK(ih);
+			LIST_FOREACH(s, &ih->states, entry) {
+				if (s->creatorid == creatorid) {
+					s->state_flags |= PFSTATE_NOSYNC;
+					pf_unlink_state(s, PF_ENTER_LOCKED);
+					goto relock;
+				}
 			}
+			PF_HASHROW_UNLOCK(ih);
 		}
-		PF_IDS_UNLOCK();
-		PF_KEYS_UNLOCK();
 	}
 	PF_UNLOCK();
 
@@ -788,8 +787,9 @@ pfsync_in_iack(struct pfsync_pkt *pkt, s
 		if (st == NULL)
 			continue;
 
-		if (ISSET(st->state_flags, PFSTATE_ACK))
+		if (st->state_flags & PFSTATE_ACK)
 			pfsync_deferred(st, 0);
+		PF_STATE_UNLOCK(st);
 	}
 	PF_UNLOCK();
 	/*
@@ -881,7 +881,7 @@ pfsync_in_upd(struct pfsync_pkt *pkt, st
 			continue;
 		}
 
-		if (ISSET(st->state_flags, PFSTATE_ACK))
+		if (st->state_flags & PFSTATE_ACK)
 			pfsync_deferred(st, 1);
 
 		sk = st->key[PF_SK_WIRE];	/* XXX right one? */
@@ -910,7 +910,8 @@ pfsync_in_upd(struct pfsync_pkt *pkt, st
 			V_pfsyncstats.pfsyncs_stale++;
 
 			pfsync_update_state(st);
-			schednetisr(NETISR_PFSYNC);
+			PF_STATE_UNLOCK(st);
+			swi_sched(V_pfsync_swi_cookie, 0);
 			continue;
 		}
 		pfsync_alloc_scrub_memory(&sp->dst, &st->dst);
@@ -919,6 +920,7 @@ pfsync_in_upd(struct pfsync_pkt *pkt, st
 		st->expire = ntohl(sp->expire) + time_second;
 		st->timeout = sp->timeout;
 		st->pfsync_time = time_uptime;
+		PF_STATE_UNLOCK(st);
 	}
 	PF_UNLOCK();
 
@@ -973,7 +975,7 @@ pfsync_in_upd_c(struct pfsync_pkt *pkt, 
 			continue;
 		}
 
-		if (ISSET(st->state_flags, PFSTATE_ACK))
+		if (st->state_flags & PFSTATE_ACK)
 			pfsync_deferred(st, 1);
 
 		sk = st->key[PF_SK_WIRE]; /* XXX right one? */
@@ -1001,7 +1003,8 @@ pfsync_in_upd_c(struct pfsync_pkt *pkt, 
 			V_pfsyncstats.pfsyncs_stale++;
 
 			pfsync_update_state(st);
-			schednetisr(NETISR_PFSYNC);
+			PF_STATE_UNLOCK(st);
+			swi_sched(V_pfsync_swi_cookie, 0);
 			continue;
 		}
 		pfsync_alloc_scrub_memory(&up->dst, &st->dst);
@@ -1010,6 +1013,7 @@ pfsync_in_upd_c(struct pfsync_pkt *pkt, 
 		st->expire = ntohl(up->expire) + time_second;
 		st->timeout = up->timeout;
 		st->pfsync_time = time_uptime;
+		PF_STATE_UNLOCK(st);
 	}
 	PF_UNLOCK();
 
@@ -1049,10 +1053,13 @@ pfsync_in_ureq(struct pfsync_pkt *pkt, s
 				V_pfsyncstats.pfsyncs_badstate++;
 				continue;
 			}
-			if (ISSET(st->state_flags, PFSTATE_NOSYNC))
+			if (st->state_flags & PFSTATE_NOSYNC) {
+				PF_STATE_UNLOCK(st);
 				continue;
+			}
 
 			pfsync_update_state_req(st);
+			PF_STATE_UNLOCK(st);
 		}
 	}
 	PF_UNLOCK();
@@ -1089,8 +1096,8 @@ pfsync_in_del(struct pfsync_pkt *pkt, st
 			V_pfsyncstats.pfsyncs_badstate++;
 			continue;
 		}
-		SET(st->state_flags, PFSTATE_NOSYNC);
-		pf_unlink_state(st, 0);
+		st->state_flags |= PFSTATE_NOSYNC;
+		pf_unlink_state(st, PF_ENTER_LOCKED);
 	}
 	PF_UNLOCK();
 
@@ -1127,10 +1134,8 @@ pfsync_in_del_c(struct pfsync_pkt *pkt, 
 			continue;
 		}
 
-		SET(st->state_flags, PFSTATE_NOSYNC);
-		PF_KEYS_LOCK();
-		pf_unlink_state(st, 0);
-		PF_KEYS_UNLOCK();
+		st->state_flags |= PFSTATE_NOSYNC;
+		pf_unlink_state(st, PF_ENTER_LOCKED);
 	}
 	PF_UNLOCK();
 
@@ -1499,6 +1504,7 @@ pfsync_drop(struct pfsync_softc *sc)
 					__func__));
 #endif
 			st->sync_state = PFSYNC_S_NONE;
+			pf_release_state(st);
 		}
 		TAILQ_INIT(&sc->sc_qs[q]);
 	}
@@ -1539,7 +1545,7 @@ pfsync_sendout1(int schedswi)
 	struct ip *ip;
 	struct pfsync_header *ph;
 	struct pfsync_subheader *subh;
-	struct pf_state *st;
+	struct pf_state *st, *next;
 	struct pfsync_upd_req_item *ur;
 #ifdef notyet
 	struct tdb *t;
@@ -1596,15 +1602,13 @@ pfsync_sendout1(int schedswi)
 		offset += sizeof(*subh);
 
 		count = 0;
-		TAILQ_FOREACH(st, &sc->sc_qs[q], sync_list) {
-#ifdef PFSYNC_DEBUG
+		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
 			KASSERT(st->sync_state == q,
 				("%s: st->sync_state == q",
 					__func__));
-#endif
-
 			offset += pfsync_qs[q].write(st, m, offset);
 			st->sync_state = PFSYNC_S_NONE;
+			pf_release_state(st);
 			count++;
 		}
 		TAILQ_INIT(&sc->sc_qs[q]);
@@ -1720,10 +1724,8 @@ pfsync_insert_state(struct pf_state *st)
 	if (sc == NULL || ISSET(st->state_flags, PFSTATE_NOSYNC))
 		return;
 
-#ifdef PFSYNC_DEBUG
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state == PFSYNC_S_NONE", __func__));
-#endif
 
 	if (sc->sc_len == PFSYNC_MINPKT)
 		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout,
@@ -1731,8 +1733,8 @@ pfsync_insert_state(struct pf_state *st)
 
 	pfsync_q_ins(st, PFSYNC_S_INS);
 
-	if (ISSET(st->state_flags, PFSTATE_ACK))
-		schednetisr(NETISR_PFSYNC);
+	if (st->state_flags & PFSTATE_ACK)
+		swi_sched(V_pfsync_swi_cookie, 0);
 	else
 		st->sync_updates = 0;
 }
@@ -1760,6 +1762,7 @@ pfsync_defer(struct pf_state *st, struct
 
 	pd->pd_st = st;
 	pd->pd_m = m;
+	pf_ref_state(st);
 
 	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
 	callout_init(&pd->pd_tmo, CALLOUT_MPSAFE);
@@ -1780,6 +1783,7 @@ pfsync_undefer(struct pfsync_deferral *p
 	sc->sc_deferred--;
 
 	CLR(pd->pd_st->state_flags, PFSTATE_ACK);
+	pf_release_state(pd->pd_st);
 	callout_stop(&pd->pd_tmo); /* bah */
 	if (drop)
 		m_freem(pd->pd_m);
@@ -1876,7 +1880,7 @@ pfsync_update_state(struct pf_state *st)
 
 	if (sync || (time_uptime - st->pfsync_time) < 2) {
 		pfsync_upds++;
-		schednetisr(NETISR_PFSYNC);
+		swi_sched(V_pfsync_swi_cookie, 0);
 	}
 }
 
@@ -1916,7 +1920,7 @@ pfsync_request_update(u_int32_t creatori
 	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
 	sc->sc_len += nlen;
 
-	schednetisr(NETISR_PFSYNC);
+	swi_sched(V_pfsync_swi_cookie, 0);
 }
 
 static void
@@ -1940,7 +1944,7 @@ pfsync_update_state_req(struct pf_state 
 		pfsync_q_del(st);
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_UPD);
-		schednetisr(NETISR_PFSYNC);
+		swi_sched(V_pfsync_swi_cookie, 0);
 		return;
 
 	case PFSYNC_S_INS:
@@ -2050,6 +2054,7 @@ pfsync_q_ins(struct pf_state *st, int q)
 	sc->sc_len += nlen;
 	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
 	st->sync_state = q;
+	pf_ref_state(st);
 }
 
 static void
@@ -2064,6 +2069,7 @@ pfsync_q_del(struct pf_state *st)
 	sc->sc_len -= pfsync_qs[q].len;
 	TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
 	st->sync_state = PFSYNC_S_NONE;
+	pf_release_state(st);
 
 	if (TAILQ_EMPTY(&sc->sc_qs[q]))
 		sc->sc_len -= sizeof(struct pfsync_subheader);
@@ -2099,7 +2105,7 @@ pfsync_update_tdb(struct tdb *t, int out
 		t->tdb_updates = 0;
 	} else {
 		if (++t->tdb_updates >= sc->sc_maxupdates)
-			schednetisr(NETISR_PFSYNC);
+			swi_sched(V_pfsync_swi_cookie, 0);
 	}
 
 	if (output)
@@ -2168,61 +2174,74 @@ pfsync_bulk_start(void)
 		printf("pfsync: received bulk update request\n");
 
 	PF_LOCK_ASSERT();
-	if (TAILQ_EMPTY(&V_state_list))
-		pfsync_bulk_status(PFSYNC_BUS_END);
-	else {
-		sc->sc_ureq_received = time_uptime;
-		if (sc->sc_bulk_next == NULL)
-			sc->sc_bulk_next = TAILQ_FIRST(&V_state_list);
-		sc->sc_bulk_last = sc->sc_bulk_next;
 
-		pfsync_bulk_status(PFSYNC_BUS_START);
-		callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
-	}
+	sc->sc_ureq_received = time_uptime;
+	sc->sc_bulk_hash_id = 0;
+	bzero(&sc->sc_bulk_state, sizeof(struct pf_state_cmp));
+	pfsync_bulk_status(PFSYNC_BUS_START);
+	callout_reset(&sc->sc_bulk_tmo, 1, pfsync_bulk_update, sc);
 }
 
 static void
 pfsync_bulk_update(void *arg)
 {
 	struct pfsync_softc *sc = arg;
-	struct pf_state *st = sc->sc_bulk_next;
-	int i = 0;
+	struct pf_state *s;
+	int i, sent = 0;
 
 	PF_LOCK_ASSERT();
-
 	CURVNET_SET(sc->sc_ifp->if_vnet);
-	for (;;) {
-		if (st->sync_state == PFSYNC_S_NONE &&
-		    st->timeout < PFTM_MAX &&
-		    st->pfsync_time <= sc->sc_ureq_received) {
-			pfsync_update_state_req(st);
-			i++;
-		}
 
-		PF_LIST_RLOCK();
-		st = TAILQ_NEXT(st, entry_list);
-		if (st == NULL)
-			st = TAILQ_FIRST(&V_state_list);
-		PF_LIST_RUNLOCK();
+	/*
+	 * Start with last state from previous invocation.
+	 * It may had gone, in this case start from the
+	 * hash slot.
+	 */
+	s = pf_find_state_byid(&sc->sc_bulk_state);
 
-		if (st == sc->sc_bulk_last) {
-			/* we're done */
-			sc->sc_bulk_next = NULL;
-			sc->sc_bulk_last = NULL;
-			pfsync_bulk_status(PFSYNC_BUS_END);
-			break;
+	if (s != NULL)
+		i = PF_IDHASH(s);
+	else
+		i = sc->sc_bulk_hash_id;
+
+	for (; i <= V_pf_hashmask; i++) {
+		struct pf_idhash *ih = &V_pf_idhash[i];
+
+		if (s != NULL)
+			PF_HASHROW_ASSERT(ih);
+		else {
+			PF_HASHROW_LOCK(ih);
+			s = LIST_FIRST(&ih->states);
 		}
 
-		if (i > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
-		    sizeof(struct pfsync_state)) {
-			/* we've filled a packet */
-			sc->sc_bulk_next = st;
-			callout_reset(&sc->sc_bulk_tmo, 1,
-			    pfsync_bulk_update, sc);
-			break;
+		for (; s; s = LIST_NEXT(s, entry)) {
+
+			if (sent > 1 && (sc->sc_ifp->if_mtu - sc->sc_len) <
+			    sizeof(struct pfsync_state)) {
+				/* We've filled a packet. */
+				sc->sc_bulk_hash_id = i;
+				bcopy(s, &sc->sc_bulk_state,
+				    sizeof(struct pf_state_cmp));
+				PF_HASHROW_UNLOCK(ih);
+				callout_reset(&sc->sc_bulk_tmo, 1,
+				    pfsync_bulk_update, sc);
+				goto full;
+			}
+
+			if (s->sync_state == PFSYNC_S_NONE &&
+			    s->timeout < PFTM_MAX &&
+			    s->pfsync_time <= sc->sc_ureq_received) {
+				pfsync_update_state_req(s);
+				sent++;
+			}
 		}
+		PF_HASHROW_UNLOCK(ih);
 	}
 
+	/* We're done. */
+	pfsync_bulk_status(PFSYNC_BUS_END);
+
+full:
 	CURVNET_RESTORE();
 }
 
@@ -2307,22 +2326,6 @@ pfsync_up(void)
 	return (1);
 }
 
-static int
-pfsync_state_in_use(struct pf_state *st)
-{
-	struct pfsync_softc *sc = V_pfsyncif;
-
-	if (sc == NULL)
-		return (0);
-
-	if (st->sync_state != PFSYNC_S_NONE ||
-	    st == sc->sc_bulk_next ||
-	    st == sc->sc_bulk_last)
-		return (1);
-
-	return (0);
-}
-
 static u_int pfsync_ints;
 static u_int pfsync_tmos;
 
@@ -2486,7 +2489,6 @@ pfsync_init()
 	pfsync_update_state_ptr = pfsync_update_state;
 	pfsync_delete_state_ptr = pfsync_delete_state;
 	pfsync_clear_states_ptr = pfsync_clear_states;
-	pfsync_state_in_use_ptr = pfsync_state_in_use;
 	pfsync_defer_ptr = pfsync_defer;
 	PF_UNLOCK();
 
@@ -2520,7 +2522,6 @@ pfsync_uninit()
 	pfsync_update_state_ptr = NULL;
 	pfsync_delete_state_ptr = NULL;
 	pfsync_clear_states_ptr = NULL;
-	pfsync_state_in_use_ptr = NULL;
 	pfsync_defer_ptr = NULL;
 	PF_UNLOCK();
 

Modified: projects/pf/head/sys/contrib/pf/net/pf.c
==============================================================================
--- projects/pf/head/sys/contrib/pf/net/pf.c	Mon Apr  2 15:07:22 2012	(r233781)
+++ projects/pf/head/sys/contrib/pf/net/pf.c	Mon Apr  2 16:13:54 2012	(r233782)
@@ -62,7 +62,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/random.h>
 #include <sys/sysctl.h>
 #include <sys/endian.h>
-#define	betoh64		be64toh
 #include <sys/proc.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
@@ -124,8 +123,6 @@ extern int ip_optcopy(struct ip *, struc
  */
 
 /* state tables */
-VNET_DEFINE(struct pf_state_tree,	 pf_statetbl);
-
 VNET_DEFINE(struct pf_altqqueue,	 pf_altqs[2]);
 VNET_DEFINE(struct pf_palist,		 pf_pabuf);
 VNET_DEFINE(struct pf_altqqueue *,	 pf_altqs_active);
@@ -191,8 +188,9 @@ static void		 pf_send_icmp(struct mbuf *
 			    sa_family_t, struct pf_rule *);
 static void		 pf_detach_state(struct pf_state *);
 static int		 pf_state_key_attach(struct pf_state_key *,
-			    struct pf_state *, int);
+			    struct pf_state_key *, struct pf_state *);
 static void		 pf_state_key_detach(struct pf_state *, int);
+static int		 pf_state_key_ini(void *, int, int);
 static u_int32_t	 pf_tcp_iss(struct pf_pdesc *);
 static int		 pf_test_rule(struct pf_rule **, struct pf_state **,
 			    int, struct pfi_kif *, struct mbuf *, int,
@@ -250,16 +248,17 @@ static void		 pf_print_state_parts(struc
 			    struct pf_state_key *, struct pf_state_key *);
 static int		 pf_addr_wrap_neq(struct pf_addr_wrap *,
 			    struct pf_addr_wrap *);
+#if 0
 static int		 pf_compare_state_keys(struct pf_state_key *,
 			    struct pf_state_key *, struct pfi_kif *, u_int);
+#endif
 static struct pf_state	*pf_find_state(struct pfi_kif *,
-			    struct pf_state_key_cmp *, u_int, struct mbuf *,
-			    struct pf_mtag *);
+			    struct pf_state_key_cmp *, u_int);
 static int		 pf_src_connlimit(struct pf_state **);
 static int		 pf_insert_src_node(struct pf_src_node **,
 			    struct pf_rule *, struct pf_addr *, sa_family_t);
 static int		 pf_check_congestion(struct ifqueue *);
-static int		 pf_purge_expired_states(u_int32_t , int);
+static void		 pf_purge_expired_states(int);
 
 int in4_cksum(struct mbuf *m, u_int8_t nxt, int off, int len);
 
@@ -267,26 +266,22 @@ VNET_DECLARE(int, pf_end_threads);
 
 VNET_DEFINE(struct pf_pool_limit, pf_pool_limits[PF_LIMIT_MAX]);
 
-#define	PPACKET_LOOPED()						\
-	(pd->pf_mtag->flags & PF_PACKET_LOOPED)
+#define	PACKET_LOOPED(pd)	((pd)->pf_mtag->flags & PF_PACKET_LOOPED)
 
-#define	PACKET_LOOPED()							\
-	(pd.pf_mtag->flags & PF_PACKET_LOOPED)
-
-#define	STATE_LOOKUP(i, k, d, s, m, pt)					\
+#define	STATE_LOOKUP(i, k, d, s, pd)					\
 	do {								\
-		s = pf_find_state(i, k, d, m, pt);			\
-		if (s == NULL || (s)->timeout == PFTM_PURGE)		\
+		(s) = pf_find_state((i), (k), (d));			\
+		if ((s) == NULL || (s)->timeout == PFTM_PURGE)		\
 			return (PF_DROP);				\
-		if (PPACKET_LOOPED())					\
+		if (PACKET_LOOPED(pd))					\
 			return (PF_PASS);				\
-		if (d == PF_OUT &&					\
+		if ((d) == PF_OUT &&					\
 		    (((s)->rule.ptr->rt == PF_ROUTETO &&		\
 		    (s)->rule.ptr->direction == PF_OUT) ||		\
 		    ((s)->rule.ptr->rt == PF_REPLYTO &&			\
 		    (s)->rule.ptr->direction == PF_IN)) &&		\
 		    (s)->rt_kif != NULL &&				\
-		    (s)->rt_kif != i)					\
+		    (s)->rt_kif != (i))					\
 			return (PF_PASS);				\
 	} while (0)
 
@@ -317,20 +312,16 @@ VNET_DEFINE(struct pf_pool_limit, pf_poo
 	} while (0)
 
 static __inline int pf_src_compare(struct pf_src_node *, struct pf_src_node *);
-static __inline int pf_state_compare_key(struct pf_state_key *,
-	struct pf_state_key *);
-static __inline int pf_state_compare_id(struct pf_state *,
-	struct pf_state *);
 
 VNET_DEFINE(struct pf_src_tree,	 	 tree_src_tracking);
 
-VNET_DEFINE(struct pf_state_tree_id,	 tree_id);
-VNET_DEFINE(struct pf_state_queue,	 state_list);
+MALLOC_DEFINE(M_PFHASH, "pf hashes", "pf(4) hash header structures");
+/* XXXGL: make static? */
+VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
+VNET_DEFINE(struct pf_idhash *, pf_idhash);
+VNET_DEFINE(u_long, pf_hashmask);
 
 RB_GENERATE(pf_src_tree, pf_src_node, entry, pf_src_compare);
-RB_GENERATE(pf_state_tree, pf_state_key, entry, pf_state_compare_key);
-RB_GENERATE(pf_state_tree_id, pf_state,
-    entry_id, pf_state_compare_id);
 
 static __inline int
 pf_src_compare(struct pf_src_node *a, struct pf_src_node *b)
@@ -484,37 +475,42 @@ pf_src_connlimit(struct pf_state **state
 
 		/* kill existing states if that's required. */
 		if ((*state)->rule.ptr->flush) {
-			struct pf_state_key *sk;
-			struct pf_state *st;
 
 			V_pf_status.lcounters[LCNT_OVERLOAD_FLUSH]++;
-			PF_IDS_LOCK();
 			/* XXXGL: this cycle should go into a separate taskq */
-			RB_FOREACH(st, pf_state_tree_id, &V_tree_id) {
-				sk = st->key[PF_SK_WIRE];
-				/*
-				 * Kill states from this source.  (Only those
-				 * from the same rule if PF_FLUSH_GLOBAL is not
-				 * set)
-				 */
-				if (sk->af ==
-				    (*state)->key[PF_SK_WIRE]->af &&
-				    (((*state)->direction == PF_OUT &&
-				    PF_AEQ(&(*state)->src_node->addr,
-					&sk->addr[0], sk->af)) ||
-				    ((*state)->direction == PF_IN &&
-				    PF_AEQ(&(*state)->src_node->addr,
-					&sk->addr[1], sk->af))) &&
-				    ((*state)->rule.ptr->flush &
-				    PF_FLUSH_GLOBAL ||
-				    (*state)->rule.ptr == st->rule.ptr)) {
-					st->timeout = PFTM_PURGE;
-					st->src.state = st->dst.state =
-					    TCPS_CLOSED;
-					killed++;
+			for (int i = 0; i <= V_pf_hashmask; i++) {
+				struct pf_idhash *ih = &V_pf_idhash[i];
+				struct pf_state_key *sk;
+				struct pf_state *s;
+
+				PF_HASHROW_LOCK(ih);
+				LIST_FOREACH(s, &ih->states, entry) {
+					sk = s->key[PF_SK_WIRE];
+					/*
+					 * Kill states from this source.
+					 * (Only those from the same rule if
+					 * PF_FLUSH_GLOBAL is not set)
+					 */
+					if (sk->af ==
+					    (*state)->key[PF_SK_WIRE]->af &&
+					    (((*state)->direction == PF_OUT &&
+					    PF_AEQ(&(*state)->src_node->addr,
+						&sk->addr[0], sk->af)) ||
+					    ((*state)->direction == PF_IN &&
+					    PF_AEQ(&(*state)->src_node->addr,
+						&sk->addr[1], sk->af))) &&
+					    ((*state)->rule.ptr->flush &
+					    PF_FLUSH_GLOBAL ||
+					    (*state)->rule.ptr == s->rule.ptr))
+					{
+						s->timeout = PFTM_PURGE;
+						s->src.state = s->dst.state =
+						    TCPS_CLOSED;
+						killed++;
+					}
 				}
+				PF_HASHROW_UNLOCK(ih);
 			}
-			PF_IDS_UNLOCK();
 			if (V_pf_status.debug >= PF_DEBUG_MISC)
 				printf(", %u states killed", killed);
 		}
@@ -591,103 +587,184 @@ pf_insert_src_node(struct pf_src_node **
 	return (0);
 }
 
-/* state table stuff */
-
-static __inline int
-pf_state_compare_key(struct pf_state_key *a, struct pf_state_key *b)
+/*
+ * Hash function shamelessly taken from ng_netflow(4), trusting
+ * mav@ and melifaro@ data on its decent distribution.
+ */
+static __inline u_int
+pf_hashkey(struct pf_state_key *sk)
 {
-	int	diff;
+	u_int h;
 
-	if ((diff = a->proto - b->proto) != 0)
-		return (diff);
-	if ((diff = a->af - b->af) != 0)
-		return (diff);
-	switch (a->af) {
-#ifdef INET
+#define	FULL_HASH(a1, a2, p1, p2)	\
+	(((a1) ^ ((a1) >> 16) ^		\
+	htons((a2) ^ ((a2) >> 16))) ^	\
+	(p1) ^ htons(p2))
+ 
+#define	ADDR_HASH(a1, a2)		\
+	((a1) ^ ((a1) >> 16) ^		\
+	htons((a2) ^ ((a2) >> 16)))
+
+	switch (sk->af) {
 	case AF_INET:
-		if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
-			return (1);
-		if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
-			return (-1);
-		if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
-			return (1);
-		if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
-			return (-1);
+		switch (sk->proto) {
+		case IPPROTO_TCP:
+		case IPPROTO_UDP:
+			h = FULL_HASH(sk->addr[0].v4.s_addr,
+			    sk->addr[1].v4.s_addr, sk->port[0], sk->port[1]);
+			break;
+		default:
+			h = ADDR_HASH(sk->addr[0].v4.s_addr,
+			    sk->addr[1].v4.s_addr);
+			break;
+		}
 		break;
-#endif /* INET */
-#ifdef INET6
 	case AF_INET6:
-		if (a->addr[0].addr32[3] > b->addr[0].addr32[3])
-			return (1);
-		if (a->addr[0].addr32[3] < b->addr[0].addr32[3])
-			return (-1);
-		if (a->addr[1].addr32[3] > b->addr[1].addr32[3])
-			return (1);
-		if (a->addr[1].addr32[3] < b->addr[1].addr32[3])
-			return (-1);
-		if (a->addr[0].addr32[2] > b->addr[0].addr32[2])
-			return (1);
-		if (a->addr[0].addr32[2] < b->addr[0].addr32[2])
-			return (-1);
-		if (a->addr[1].addr32[2] > b->addr[1].addr32[2])
-			return (1);
-		if (a->addr[1].addr32[2] < b->addr[1].addr32[2])
-			return (-1);
-		if (a->addr[0].addr32[1] > b->addr[0].addr32[1])
-			return (1);
-		if (a->addr[0].addr32[1] < b->addr[0].addr32[1])
-			return (-1);
-		if (a->addr[1].addr32[1] > b->addr[1].addr32[1])
-			return (1);
-		if (a->addr[1].addr32[1] < b->addr[1].addr32[1])
-			return (-1);
-		if (a->addr[0].addr32[0] > b->addr[0].addr32[0])
-			return (1);
-		if (a->addr[0].addr32[0] < b->addr[0].addr32[0])
-			return (-1);
-		if (a->addr[1].addr32[0] > b->addr[1].addr32[0])
-			return (1);
-		if (a->addr[1].addr32[0] < b->addr[1].addr32[0])
-			return (-1);
+		switch (sk->proto) {
+		case IPPROTO_TCP:
+		case IPPROTO_UDP:
+			h = FULL_HASH(sk->addr[0].v6.__u6_addr.__u6_addr32[3],
+			    sk->addr[1].v6.__u6_addr.__u6_addr32[3],
+			    sk->port[0], sk->port[1]);
+			break;
+		default:
+			h = ADDR_HASH(sk->addr[0].v6.__u6_addr.__u6_addr32[3],
+			    sk->addr[1].v6.__u6_addr.__u6_addr32[3]);
+			break;
+		}
 		break;
-#endif /* INET6 */
+	default:
+		panic("%s: unknown address family %u", __func__, sk->af);
 	}
 
-	if ((diff = a->port[0] - b->port[0]) != 0)
-		return (diff);
-	if ((diff = a->port[1] - b->port[1]) != 0)
-		return (diff);
-
-	return (0);
+	return (h & V_pf_hashmask);
 }
 
-static __inline int
-pf_state_compare_id(struct pf_state *a, struct pf_state *b)
+/* Data storage structures initialization. */
+void
+pf_initialize()
 {
-	if (a->id > b->id)
-		return (1);
-	if (a->id < b->id)
-		return (-1);
-	if (a->creatorid > b->creatorid)
-		return (1);
-	if (a->creatorid < b->creatorid)
-		return (-1);
+	struct pf_keyhash	*kh;
+	struct pf_idhash	*ih;
+	u_int i;
+
+	/* States and state keys storage. */
+	V_pf_state_z = uma_zcreate("pf states", sizeof(struct pf_state),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	V_pf_pool_limits[PF_LIMIT_STATES].pp = V_pf_state_z;
+        uma_zone_set_max(V_pf_state_z, PFSTATE_HIWAT);
+
+	V_pf_state_key_z = uma_zcreate("pf state keys",
+	    sizeof(struct pf_state_key), NULL, NULL, pf_state_key_ini, NULL,
+	    UMA_ALIGN_PTR, 0);
+	V_pf_keyhash = malloc(PF_HASHSIZ * sizeof(struct pf_keyhash), M_PFHASH,
+	    M_WAITOK|M_ZERO);
+	V_pf_idhash = malloc(PF_HASHSIZ * sizeof(struct pf_idhash), M_PFHASH,
+	    M_WAITOK|M_ZERO);
+	V_pf_hashmask = PF_HASHSIZ - 1;
+	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
+	    i++, kh++, ih++) {
+		mtx_init(&kh->lock, "pf_keyhash", NULL, MTX_DEF);
+		mtx_init(&ih->lock, "pf_idhash", NULL, MTX_DEF);
+	}
+
+	/* Source nodes. */
+	V_pf_src_tree_z = uma_zcreate("pf src nodes",
+	    sizeof(struct pf_src_node), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+	    0);
+	V_pf_pool_limits[PF_LIMIT_SRC_NODES].pp = V_pf_src_tree_z;
+	RB_INIT(&V_tree_src_tracking);
+
+	/* ALTQ */
+	V_pf_altq_z = uma_zcreate("pf altq", sizeof(struct pf_altq),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	TAILQ_INIT(&V_pf_altqs[0]);
+	TAILQ_INIT(&V_pf_altqs[1]);
+	TAILQ_INIT(&V_pf_pabuf);
+	V_pf_altqs_active = &V_pf_altqs[0];
+	V_pf_altqs_inactive = &V_pf_altqs[1];
+
+	/* XXXGL: sort this out */
+	V_pf_rule_z = uma_zcreate("pf rules", sizeof(struct pf_rule),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	V_pf_pooladdr_z = uma_zcreate("pf pool addresses",
+	    sizeof(struct pf_pooladdr), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+	    0);
+	V_pfr_ktable_z = uma_zcreate("pf tables",
+	    sizeof(struct pfr_ktable), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+	    0);
+	V_pf_pool_limits[PF_LIMIT_TABLES].pp = V_pfr_ktable_z;
+	V_pfr_kentry_z = uma_zcreate("pf table entries",
+	    sizeof(struct pfr_kentry), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
+	    0);
+	V_pf_pool_limits[PF_LIMIT_TABLE_ENTRIES].pp = V_pfr_kentry_z;
+	V_pfi_addr_z = uma_zcreate("pf pfi_dynaddr", sizeof(struct pfi_dynaddr),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
 
-	return (0);
+void
+pf_cleanup()
+{
+	struct pf_keyhash	*kh;
+	struct pf_idhash	*ih;
+	u_int i;
+
+	for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
+	    i++, kh++, ih++) {
+		KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
+		    __func__));
+		KASSERT(LIST_EMPTY(&ih->states), ("%s: id hash not empty",
+		    __func__));
+		mtx_destroy(&kh->lock);
+		mtx_destroy(&ih->lock);
+	}
+	free(V_pf_keyhash, M_PFHASH);
+	free(V_pf_idhash, M_PFHASH);
+
+	uma_zdestroy(V_pf_src_tree_z);
+	uma_zdestroy(V_pf_rule_z);
+	uma_zdestroy(V_pf_state_z);
+	uma_zdestroy(V_pf_state_key_z);
+	uma_zdestroy(V_pf_altq_z);
+	uma_zdestroy(V_pf_pooladdr_z);
+	uma_zdestroy(V_pfr_ktable_z);
+	uma_zdestroy(V_pfr_kentry_z);
+	uma_zdestroy(V_pfi_addr_z);
 }
 
 static int
-pf_state_key_attach(struct pf_state_key *sk, struct pf_state *s, int idx)
+pf_state_key_attach(struct pf_state_key *skw, struct pf_state_key *sks,
+    struct pf_state *s)
 {
-	struct pf_state_key	*cur;
+	struct pf_keyhash	*kh;
+	struct pf_state_key	*sk, *cur;
 	struct pf_state		*si, *olds = NULL;
+	int idx;
+
+	KASSERT(s->refs == 0, ("%s: state not pristine", __func__));
+	KASSERT(s->key[PF_SK_WIRE] == NULL, ("%s: state has key", __func__));
+	KASSERT(s->key[PF_SK_STACK] == NULL, ("%s: state has key", __func__));
+
+	/*
+	 * First run: start with wire key.
+	 */
+	sk = skw;
+	idx = PF_SK_WIRE;
+
+keyattach:
+	kh = &V_pf_keyhash[pf_hashkey(sk)];
 
-	PF_KEYS_ASSERT();
-	KASSERT(s->key[idx] == NULL, ("%s: a key already attached", __func__));
+	PF_HASHROW_LOCK(kh);
+	LIST_FOREACH(cur, &kh->keys, entry)
+		if (bcmp(cur, sk, sizeof(struct pf_state_key_cmp)) == 0)
+			break;
+
+	if (cur != NULL) {
+		/* Key exists. Check for same kif, if none, add to key. */
+		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx]) {
+			struct pf_idhash *ih = &V_pf_idhash[PF_IDHASH(si)];
 
-	if ((cur = RB_INSERT(pf_state_tree, &V_pf_statetbl, sk)) != NULL) {
-		/* key exists. check for same kif, if none, add to key */
-		TAILQ_FOREACH(si, &cur->states[idx], key_list[idx])
+			PF_HASHROW_LOCK(ih);
 			if (si->kif == s->kif &&
 			    si->direction == s->direction) {
 				if (sk->proto == IPPROTO_TCP &&
@@ -696,6 +773,7 @@ pf_state_key_attach(struct pf_state_key 
 					si->src.state = si->dst.state =
 					    TCPS_CLOSED;
 					/* Unlink later or cur can go away. */
+					pf_ref_state(si);
 					olds = si;
 				} else {
 					if (V_pf_status.debug >= PF_DEBUG_MISC) {
@@ -717,29 +795,63 @@ pf_state_key_attach(struct pf_state_key 
 						    sk : NULL);
 						printf("\n");
 					}
+					PF_HASHROW_UNLOCK(ih);
+					PF_HASHROW_UNLOCK(kh);
 					uma_zfree(V_pf_state_key_z, sk);
+					if (idx == PF_SK_STACK)
+						pf_detach_state(s);
 					return (-1);	/* collision! */
 				}
 			}
-		/*
-		 * Collided key may be the same we are trying to attach,
-		 * this happens for non-NAT states, they are attached
-		 * twice: via PF_SK_WIRE and PF_SK_STACK tailqs.
-		 */
-		if (cur != sk)
-			uma_zfree(V_pf_state_key_z, sk);
+			PF_HASHROW_UNLOCK(ih);
+		}
+		uma_zfree(V_pf_state_key_z, sk);
 		s->key[idx] = cur;
-	} else
+	} else {
+		LIST_INSERT_HEAD(&kh->keys, sk, entry);
 		s->key[idx] = sk;
+	}
 
-	/* list is sorted, if-bound states before floating */
+stateattach:
+	/* List is sorted, if-bound states before floating. */
 	if (s->kif == V_pfi_all)
 		TAILQ_INSERT_TAIL(&s->key[idx]->states[idx], s, key_list[idx]);
 	else
 		TAILQ_INSERT_HEAD(&s->key[idx]->states[idx], s, key_list[idx]);
 
-	if (olds)
+	/*
+	 * Attach done. See how should we (or should not?)
+	 * attach a second key.
+	 */
+	if (sks == skw) {
+		s->key[PF_SK_STACK] = s->key[PF_SK_WIRE];
+		idx = PF_SK_STACK;
+		sks = NULL;
+		goto stateattach;
+	} else if (sks != NULL) {
+		PF_HASHROW_UNLOCK(kh);
+		if (olds) {
+			pf_unlink_state(olds, 0);
+			pf_release_state(olds);
+			olds = NULL;
+		}
+		/*
+		 * Continue attaching with stack key.
+		 */
+		sk = sks;
+		idx = PF_SK_STACK;
+		sks = NULL;
+		goto keyattach;
+	} else
+		PF_HASHROW_UNLOCK(kh);
+
+	if (olds) {
 		pf_unlink_state(olds, 0);
+		pf_release_state(olds);
+	}
+
+	KASSERT(s->key[PF_SK_WIRE] != NULL && s->key[PF_SK_STACK] != NULL,
+	    ("%s failure", __func__));
 
 	return (0);
 }
@@ -747,34 +859,67 @@ pf_state_key_attach(struct pf_state_key 
 static void

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


