Date:      Thu, 1 Aug 2019 14:17:32 +0000 (UTC)
From:      Randall Stewart <rrs@FreeBSD.org>
To:        src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org
Subject:   svn commit: r350501 - in head/sys: conf dev/cxgbe dev/mlx5/mlx5_en net netinet
Message-ID:  <201908011417.x71EHW0j029518@repo.freebsd.org>

Author: rrs
Date: Thu Aug  1 14:17:31 2019
New Revision: 350501
URL: https://svnweb.freebsd.org/changeset/base/350501

Log:
  This adds the third step in getting BBR into the tree. Both BBR and
  an updated rack depend on having access to the new
  ratelimit API added in this commit.
  
  Sponsored by:	Netflix Inc.
  Differential Revision:	https://reviews.freebsd.org/D20953

Added:
  head/sys/netinet/tcp_ratelimit.c   (contents, props changed)
  head/sys/netinet/tcp_ratelimit.h   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/dev/cxgbe/adapter.h
  head/sys/dev/cxgbe/t4_main.c
  head/sys/dev/cxgbe/t4_sched.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  head/sys/net/if_dead.c
  head/sys/net/if_lagg.c
  head/sys/net/if_var.h
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/conf/files	Thu Aug  1 14:17:31 2019	(r350501)
@@ -4276,6 +4276,7 @@ netinet/tcp_lro.c		optional inet | inet6
 netinet/tcp_output.c		optional inet | inet6
 netinet/tcp_offload.c		optional tcp_offload inet | tcp_offload inet6
 netinet/tcp_hpts.c              optional tcphpts inet | tcphpts inet6
+netinet/tcp_ratelimit.c         optional ratelimit inet | ratelimit inet6
 netinet/tcp_pcap.c		optional inet tcppcap | inet6 tcppcap \
 	compile-with "${NORMAL_C} ${NO_WNONNULL}"
 netinet/tcp_reass.c		optional inet | inet6

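The new file is compiled only into kernels built with the ratelimit
option (together with INET and/or INET6).  For reference, a kernel
configuration fragment along these lines pulls it in; this is
illustrative and not part of the change itself:

	options 	RATELIMIT	# hardware pacing via if_snd_tag / if_ratelimit_query
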
Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/dev/cxgbe/adapter.h	Thu Aug  1 14:17:31 2019	(r350501)
@@ -1247,6 +1247,7 @@ int cxgbe_snd_tag_modify(struct m_snd_tag *, union if_
 int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
 void cxgbe_snd_tag_free(struct m_snd_tag *);
 void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
+void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *);
 #endif
 
 /* t4_filter.c */

Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/dev/cxgbe/t4_main.c	Thu Aug  1 14:17:31 2019	(r350501)
@@ -1658,6 +1658,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
 	ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
 	ifp->if_snd_tag_query = cxgbe_snd_tag_query;
 	ifp->if_snd_tag_free = cxgbe_snd_tag_free;
+	ifp->if_ratelimit_query = cxgbe_ratelimit_query;
 #endif
 
 	ifp->if_capabilities = T4_CAP;

Modified: head/sys/dev/cxgbe/t4_sched.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sched.c	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/dev/cxgbe/t4_sched.c	Thu Aug  1 14:17:31 2019	(r350501)
@@ -903,4 +903,35 @@ cxgbe_snd_tag_free(struct m_snd_tag *mst)
 	}
 	mtx_unlock(&cst->lock);
 }
+
+#define CXGBE_MAX_FLOWS 4000	/* Testing so far shows this is all the adapter can do */
+#define CXGBE_UNIQUE_RATE_COUNT 16 /* Number of unique rates that can be set up */
+
+void
+cxgbe_ratelimit_query(struct ifnet *ifp __unused,
+     struct if_ratelimit_query_results *q)
+{
+	/*
+	 * This is a skeleton and needs future work
+	 * by the driver maintainers. It should be
+	 * enhanced to look at the specific type of
+	 * interface and select appropriate values
+	 * for these settings. This example matches
+	 * an earlier card (T5): it supports at most
+	 * 16 rates, which the first users to request
+	 * them select (hence the RT_IS_SELECTABLE
+	 * flag). If the card had a fixed table we
+	 * would instead set up a const array (see
+	 * mlx5 for an example). The card tested can
+	 * only reasonably support 4000 flows before
+	 * it has issues with sending, so we limit the
+	 * number of flows using hardware pacing to that
+	 * number; other cards may raise or eliminate this limit.
+	 */
+	q->rate_table = NULL;
+	q->flags = RT_IS_SELECTABLE;
+	q->max_flows = CXGBE_MAX_FLOWS;
+	q->number_of_rates = CXGBE_UNIQUE_RATE_COUNT;
+	q->min_segment_burst = 4;	/* Driver emits 4 in a burst */
+}
 #endif

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Thu Aug  1 14:17:31 2019	(r350501)
@@ -4070,7 +4070,49 @@ mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_sn
 	}
 }
 
+#define NUM_HDWR_RATES_MLX 13
+static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
+	135375,			/* 1,083,000 */
+	180500,			/* 1,444,000 */
+	270750,			/* 2,166,000 */
+	361000,			/* 2,888,000 */
+	541500,			/* 4,332,000 */
+	721875,			/* 5,775,000 */
+	1082875,		/* 8,663,000 */
+	1443875,		/* 11,551,000 */
+	2165750,		/* 17,326,000 */
+	2887750,		/* 23,102,000 */
+	4331625,		/* 34,653,000 */
+	5775500,		/* 46,204,000 */
+	8663125			/* 69,305,000 */
+};
+
 static void
+mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+	/*
+	 * This function needs updating by the driver maintainer!
+	 * For this MLX card (ConnectX-4?) there are currently 13
+	 * pre-set rates; later cards (ConnectX-5, 6, 7?) may differ.
+	 *
+	 * This will change with later adapters,
+	 * and this code should be updated to look at ifp
+	 * and figure out the specific adapter type
+	 * settings, i.e. how many rates there are as well
+	 * as whether they are fixed (as shown here) or
+	 * dynamic (for example, the Chelsio T4). If there
+	 * is a maximum number of flows that the adapter
+	 * can handle, that too needs to be reflected in
+	 * the max_flows field.
+	 */
+	q->rate_table = adapter_rates_mlx;
+	q->flags = RT_IS_FIXED_TABLE;
+	q->max_flows = 0;	/* mlx has no limit */
+	q->number_of_rates = NUM_HDWR_RATES_MLX;
+	q->min_segment_burst = 1;
+}
+
+static void
 mlx5e_snd_tag_free(struct m_snd_tag *pmt)
 {
 	struct mlx5e_snd_tag *tag =
@@ -4155,7 +4197,9 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 	ifp->if_snd_tag_free = mlx5e_snd_tag_free;
 	ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
 	ifp->if_snd_tag_query = mlx5e_snd_tag_query;
-
+#ifdef RATELIMIT
+	ifp->if_ratelimit_query = mlx5e_ratelimit_query;
+#endif
 	/* set TSO limits so that we don't have to drop TX packets */
 	ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
 	ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;

Modified: head/sys/net/if_dead.c
==============================================================================
--- head/sys/net/if_dead.c	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/net/if_dead.c	Thu Aug  1 14:17:31 2019	(r350501)
@@ -126,6 +126,23 @@ ifdead_snd_tag_free(struct m_snd_tag *pmt)
 {
 }
 
+static void
+ifdead_ratelimit_query(struct ifnet *ifp __unused,
+      struct if_ratelimit_query_results *q)
+{
+	/*
+	 * A dead (detached) interface
+	 * cannot support hardware rate
+	 * limiting, so report that no
+	 * support is available to
+	 * anyone who asks.
+	 */
+	q->rate_table = NULL;
+	q->flags = RT_NOSUPPORT;
+	q->max_flows = 0;
+	q->number_of_rates = 0;
+}
+
 void
 if_dead(struct ifnet *ifp)
 {
@@ -142,4 +159,5 @@ if_dead(struct ifnet *ifp)
 	ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
 	ifp->if_snd_tag_query = ifdead_snd_tag_query;
 	ifp->if_snd_tag_free = ifdead_snd_tag_free;
+	ifp->if_ratelimit_query = ifdead_ratelimit_query;
 }

Modified: head/sys/net/if_lagg.c
==============================================================================
--- head/sys/net/if_lagg.c	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/net/if_lagg.c	Thu Aug  1 14:17:31 2019	(r350501)
@@ -144,6 +144,8 @@ static int	lagg_snd_tag_modify(struct m_snd_tag *,
 static int	lagg_snd_tag_query(struct m_snd_tag *,
 		    union if_snd_tag_query_params *);
 static void	lagg_snd_tag_free(struct m_snd_tag *);
+static void     lagg_ratelimit_query(struct ifnet *,
+		    struct if_ratelimit_query_results *);
 #endif
 static int	lagg_setmulti(struct lagg_port *);
 static int	lagg_clrmulti(struct lagg_port *);
@@ -537,6 +539,7 @@ lagg_clone_create(struct if_clone *ifc, int unit, cadd
 	ifp->if_snd_tag_modify = lagg_snd_tag_modify;
 	ifp->if_snd_tag_query = lagg_snd_tag_query;
 	ifp->if_snd_tag_free = lagg_snd_tag_free;
+	ifp->if_ratelimit_query = lagg_ratelimit_query;
 #endif
 	ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
 
@@ -1670,6 +1673,20 @@ lagg_snd_tag_free(struct m_snd_tag *mst)
 	free(lst, M_LAGG);
 }
 
+static void
+lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+	/*
+	 * For lagg, we have an indirect
+	 * interface. The caller needs to
+	 * get a ratelimit tag on the actual
+	 * interface the flow will go on.
+	 */
+	q->rate_table = NULL;
+	q->flags = RT_IS_INDIRECT;
+	q->max_flows = 0;
+	q->number_of_rates = 0;
+}
 #endif
 
 static int

Modified: head/sys/net/if_var.h
==============================================================================
--- head/sys/net/if_var.h	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/net/if_var.h	Thu Aug  1 14:17:31 2019	(r350501)
@@ -203,6 +203,8 @@ struct if_snd_tag_alloc_header {
 struct if_snd_tag_alloc_rate_limit {
 	struct if_snd_tag_alloc_header hdr;
 	uint64_t max_rate;	/* in bytes/s */
+	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
+	uint32_t reserved;	/* alignment */
 };
 
 struct if_snd_tag_rate_limit_params {
@@ -210,7 +212,7 @@ struct if_snd_tag_rate_limit_params {
 	uint32_t queue_level;	/* 0 (empty) .. 65535 (full) */
 #define	IF_SND_QUEUE_LEVEL_MIN 0
 #define	IF_SND_QUEUE_LEVEL_MAX 65535
-	uint32_t reserved;	/* padding */
+	uint32_t flags;		/* M_NOWAIT or M_WAITOK */
 };
 
 union if_snd_tag_alloc_params {
@@ -229,12 +231,38 @@ union if_snd_tag_query_params {
 	struct if_snd_tag_rate_limit_params unlimited;
 };
 
+/* Query return flags */
+#define RT_NOSUPPORT	  0x00000000	/* Not supported */
+#define RT_IS_INDIRECT    0x00000001	/*
+					 * Indirect interface (e.g. lagg);
+					 * query the underlying interface
+					 * for its capabilities.
+					 */
+#define RT_IS_SELECTABLE  0x00000002	/*
+					 * No fixed rate table; the caller
+					 * selects rates and the first
+					 * number_of_rates requested are created.
+					 */
+#define RT_IS_FIXED_TABLE 0x00000004	/* A fixed rate table is attached */
+#define RT_IS_UNUSABLE	  0x00000008	/* Ratelimiting is unusable on this interface */
+
+struct if_ratelimit_query_results {
+	const uint64_t *rate_table;	/* Pointer to table if present */
+	uint32_t flags;			/* Flags indicating results */
+	uint32_t max_flows;		/* Max number of flows that can use this, 0=unlimited */
+	uint32_t number_of_rates;	/* How many unique rates can be created */
+	uint32_t min_segment_burst;	/* The amount the adapter bursts at each send */
+};
+
 typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
     struct m_snd_tag **);
 typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
 typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
 typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
+typedef void (if_ratelimit_query_t)(struct ifnet *,
+    struct if_ratelimit_query_results *);
 
+
 /*
  * Structure defining a network interface.
  */
@@ -374,6 +402,7 @@ struct ifnet {
 	if_snd_tag_modify_t *if_snd_tag_modify;
 	if_snd_tag_query_t *if_snd_tag_query;
 	if_snd_tag_free_t *if_snd_tag_free;
+	if_ratelimit_query_t *if_ratelimit_query;
 
 	/* Ethernet PCP */
 	uint8_t if_pcp;

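The query hook and result structure above are consumed roughly as in
the following minimal sketch, modeled on rt_setup_new_rs() in the new
tcp_ratelimit.c below.  The function and variable names here are
illustrative only, and locking and error handling are omitted:

	static const uint64_t *
	example_query_rates(struct ifnet *ifp, uint32_t *nrates)
	{
		struct if_ratelimit_query_results rl;

		if (ifp->if_ratelimit_query == NULL)
			return (NULL);	/* driver provides no hook */
		rl.flags = RT_NOSUPPORT;
		ifp->if_ratelimit_query(ifp, &rl);
		if (rl.flags & RT_IS_UNUSABLE)
			return (NULL);	/* hook present, but no real support */
		if (rl.flags & RT_IS_INDIRECT)
			return (NULL);	/* e.g. lagg: query the underlying port */
		if (rl.flags & RT_IS_FIXED_TABLE) {
			/* e.g. mlx5: the driver supplies the table. */
			*nrates = rl.number_of_rates;
			return (rl.rate_table);
		}
		if (rl.flags & RT_IS_SELECTABLE) {
			/*
			 * e.g. cxgbe: no table; the caller may set up to
			 * number_of_rates distinct rates of its choosing.
			 */
			*nrates = rl.number_of_rates;
		}
		return (NULL);
	}
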
Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/netinet/in_pcb.c	Thu Aug  1 14:17:31 2019	(r350501)
@@ -210,6 +210,22 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtim
 	&VNET_NAME(ipport_randomtime), 0,
 	"Minimum time to keep sequental port "
 	"allocation before switching to a random one");
+
+#ifdef RATELIMIT
+counter_u64_t rate_limit_active;
+counter_u64_t rate_limit_alloc_fail;
+counter_u64_t rate_limit_set_ok;
+
+static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0,
+    "IP Rate Limiting");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
+    &rate_limit_active, "Active rate limited connections");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
+   &rate_limit_alloc_fail, "Rate limited connection failures");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
+   &rate_limit_set_ok, "Rate limited setting succeeded");
+#endif /* RATELIMIT */
+
 #endif /* INET */
 
 /*
@@ -3170,6 +3186,7 @@ in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_p
 {
 	union if_snd_tag_modify_params params = {
 		.rate_limit.max_rate = max_pacing_rate,
+		.rate_limit.flags = M_NOWAIT,
 	};
 	struct m_snd_tag *mst;
 	struct ifnet *ifp;
@@ -3256,7 +3273,8 @@ in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_tx
  */
 int
 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
-    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
+
 {
 	union if_snd_tag_alloc_params params = {
 		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
@@ -3264,22 +3282,47 @@ in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *
 		.rate_limit.hdr.flowid = flowid,
 		.rate_limit.hdr.flowtype = flowtype,
 		.rate_limit.max_rate = max_pacing_rate,
+		.rate_limit.flags = M_NOWAIT,
 	};
 	int error;
 
 	INP_WLOCK_ASSERT(inp);
 
-	if (inp->inp_snd_tag != NULL)
+	if (*st != NULL)
 		return (EINVAL);
 
 	if (ifp->if_snd_tag_alloc == NULL) {
 		error = EOPNOTSUPP;
 	} else {
 		error = ifp->if_snd_tag_alloc(ifp, &params, &inp->inp_snd_tag);
+
+		if (error == 0) {
+			counter_u64_add(rate_limit_set_ok, 1);
+			counter_u64_add(rate_limit_active, 1);
+		} else
+			counter_u64_add(rate_limit_alloc_fail, 1);
 	}
 	return (error);
 }
 
+void
+in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst)
+{
+	if (ifp == NULL)
+		return;
+
+	/*
+	 * If the device was detached while we still had reference(s)
+	 * on the ifp, we assume if_snd_tag_free() was replaced with
+	 * stubs.
+	 */
+	ifp->if_snd_tag_free(mst);
+
+	/* release reference count on network interface */
+	if_rele(ifp);
+	counter_u64_add(rate_limit_active, -1);
+}
+
 /*
  * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
  * if any:
@@ -3300,49 +3343,12 @@ in_pcbdetach_txrtlmt(struct inpcb *inp)
 	m_snd_tag_rele(mst);
 }
 
-/*
- * This function should be called when the INP_RATE_LIMIT_CHANGED flag
- * is set in the fast path and will attach/detach/modify the TX rate
- * limit send tag based on the socket's so_max_pacing_rate value.
- */
-void
-in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
 {
-	struct socket *socket;
-	uint32_t max_pacing_rate;
-	bool did_upgrade;
 	int error;
 
-	if (inp == NULL)
-		return;
-
-	socket = inp->inp_socket;
-	if (socket == NULL)
-		return;
-
-	if (!INP_WLOCKED(inp)) {
-		/*
-		 * NOTE: If the write locking fails, we need to bail
-		 * out and use the non-ratelimited ring for the
-		 * transmit until there is a new chance to get the
-		 * write lock.
-		 */
-		if (!INP_TRY_UPGRADE(inp))
-			return;
-		did_upgrade = 1;
-	} else {
-		did_upgrade = 0;
-	}
-
 	/*
-	 * NOTE: The so_max_pacing_rate value is read unlocked,
-	 * because atomic updates are not required since the variable
-	 * is checked at every mbuf we send. It is assumed that the
-	 * variable read itself will be atomic.
-	 */
-	max_pacing_rate = socket->so_max_pacing_rate;
-
-	/*
 	 * If the existing send tag is for the wrong interface due to
 	 * a route change, first drop the existing tag.  Set the
 	 * CHANGED flag so that we will keep trying to allocate a new
@@ -3376,13 +3382,61 @@ in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *
 			error = EAGAIN;
 		} else {
 			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
-			    mb->m_pkthdr.flowid, max_pacing_rate);
+			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
 		}
 	} else {
 		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
 	}
 	if (error == 0 || error == EOPNOTSUPP)
 		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+
+	return (error);
+}
+
+/*
+ * This function should be called when the INP_RATE_LIMIT_CHANGED flag
+ * is set in the fast path and will attach/detach/modify the TX rate
+ * limit send tag based on the socket's so_max_pacing_rate value.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+	struct socket *socket;
+	uint32_t max_pacing_rate;
+	bool did_upgrade;
+	int error;
+
+	if (inp == NULL)
+		return;
+
+	socket = inp->inp_socket;
+	if (socket == NULL)
+		return;
+
+	if (!INP_WLOCKED(inp)) {
+		/*
+		 * NOTE: If the write locking fails, we need to bail
+		 * out and use the non-ratelimited ring for the
+		 * transmit until there is a new chance to get the
+		 * write lock.
+		 */
+		if (!INP_TRY_UPGRADE(inp))
+			return;
+		did_upgrade = 1;
+	} else {
+		did_upgrade = 0;
+	}
+
+	/*
+	 * NOTE: The so_max_pacing_rate value is read unlocked,
+	 * because atomic updates are not required since the variable
+	 * is checked at every mbuf we send. It is assumed that the
+	 * variable read itself will be atomic.
+	 */
+	max_pacing_rate = socket->so_max_pacing_rate;
+
+	error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
+
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
@@ -3424,4 +3478,14 @@ in_pcboutput_eagain(struct inpcb *inp)
 	if (did_upgrade)
 		INP_DOWNGRADE(inp);
 }
+
+static void
+rl_init(void *st)
+{
+	rate_limit_active = counter_u64_alloc(M_WAITOK);
+	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
+	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
+}
+
+SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
 #endif /* RATELIMIT */

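The new rate_limit.flags field carries M_NOWAIT or M_WAITOK from the
caller down to the driver, presumably so an if_snd_tag_alloc
implementation can use it for its own allocations.  A hypothetical
driver-side sketch follows; the foo_* names are invented for
illustration and do not correspond to any real driver:

	struct foo_snd_tag {
		struct m_snd_tag com;	/* must be first */
		uint64_t max_rate;
	};

	static int
	foo_snd_tag_alloc(struct ifnet *ifp,
	    union if_snd_tag_alloc_params *params, struct m_snd_tag **ppmt)
	{
		struct foo_snd_tag *tag;

		if (params->hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
			return (EOPNOTSUPP);
		/* Honor the caller's wait semantics (M_NOWAIT or M_WAITOK). */
		tag = malloc(sizeof(*tag), M_DEVBUF,
		    params->rate_limit.flags | M_ZERO);
		if (tag == NULL)
			return (ENOMEM);
		m_snd_tag_init(&tag->com, ifp);
		tag->max_rate = params->rate_limit.max_rate;
		/* ... program the hardware for tag->max_rate here ... */
		*ppmt = &tag->com;
		return (0);
	}
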
Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h	Thu Aug  1 14:13:04 2019	(r350500)
+++ head/sys/netinet/in_pcb.h	Thu Aug  1 14:17:31 2019	(r350501)
@@ -883,8 +883,13 @@ struct sockaddr *
 	in_sockaddr(in_port_t port, struct in_addr *addr);
 void	in_pcbsosetlabel(struct socket *so);
 #ifdef RATELIMIT
-int	in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
+	    struct mbuf *, uint32_t);
+int	in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
+	    uint32_t, struct m_snd_tag **);
 void	in_pcbdetach_txrtlmt(struct inpcb *);
+void    in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst);
 int	in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
 int	in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
 int	in_pcbquery_txrlevel(struct inpcb *, uint32_t *);

Added: head/sys/netinet/tcp_ratelimit.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/netinet/tcp_ratelimit.c	Thu Aug  1 14:17:31 2019	(r350501)
@@ -0,0 +1,1234 @@
+/*-
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018-2019
+ *	Netflix Inc.
+ *      All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs@netflix.com>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/mutex.h>
+#include <sys/ck.h>
+#define TCPSTATES		/* for logging */
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_ratelimit.h>
+#ifndef USECS_IN_SECOND
+#define USECS_IN_SECOND 1000000
+#endif
+/*
+ * The assumed size of an Ethernet frame,
+ * for the purposes of each send.
+ */
+#ifndef ETHERNET_SEGMENT_SIZE
+#define ETHERNET_SEGMENT_SIZE 1500
+#endif
+MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
+#ifdef RATELIMIT
+
+#define COMMON_RATE 180500
+uint64_t desired_rates[] = {
+	180500,			/* 1.44Mbps */
+	180500,			/* 1.44Mpbs */
+	375000,			/* 3Mbps */
+	500000,			/* 4Mbps */
+	625000,			/* 5Mbps */
+	750000,			/* 6Mbps */
+	1000000,		/* 8Mbps */
+	1250000,		/* 10Mbps */
+	2500000,		/* 20Mbps */
+	3750000,		/* 30Mbps */
+	5000000,		/* 40Mbps */
+	6250000,		/* 50Mbps */
+	12500000,		/* 100Mbps */
+	25000000,		/* 200Mbps */
+	50000000,		/* 400Mbps */
+	100000000,		/* 800Mbps */
+	12500,			/* 100kbps */
+	25000,			/* 200kbps */
+	875000,			/* 7Mbps */
+	1125000,		/* 9Mbps */
+	1875000,		/* 15Mbps */
+	3125000,		/* 25Mbps */
+	8125000,		/* 65Mbps */
+	10000000,		/* 80Mbps */
+	18750000,		/* 150Mbps */
+	20000000,		/* 160Mbps */
+	37500000,		/* 300Mbps */
+	62500000,		/* 500Mbps */
+	78125000,		/* 625Mbps */
+	125000000,		/* 1Gbps */
+};
+#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
+#define RS_ORDERED_COUNT 16	/*
+				 * Number of entries that are in order
+				 * at the beginning of the table;
+				 * beyond this a sort is required.
+				 */
+#define RS_NEXT_ORDER_GROUP 16	/*
+				 * The point in our table where
+				 * we fill in the second ordered
+				 * group (as an index this means -1).
+				 */
+#define ALL_HARDWARE_RATES 1004 /*
+				 * 1Meg - 1Gig in 1 Meg steps,
+				 * plus 100k, 200k and 500k, and
+				 * 10Gig
+				 */
+
+#define RS_ONE_MEGABIT_PERSEC 1000000
+#define RS_ONE_GIGABIT_PERSEC 1000000000
+#define RS_TEN_GIGABIT_PERSEC 10000000000
+
+static struct head_tcp_rate_set int_rs;
+static struct mtx rs_mtx;
+uint32_t rs_number_alive;
+uint32_t rs_number_dead;
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
+    "TCP Ratelimit stats");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
+    &rs_number_alive, 0,
+    "Number of interfaces initialized for ratelimiting");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
+    &rs_number_dead, 0,
+    "Number of interfaces departing from ratelimiting");
+
+static void
+rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
+{
+	/*
+	 * Add sysctl entries for this interface.
+	 */
+	if (rs->rs_flags & RS_INTF_NO_SUP) {
+		SYSCTL_ADD_S32(&rs->sysctl_ctx,
+		   SYSCTL_CHILDREN(rl_sysctl_root),
+		   OID_AUTO, "disable", CTLFLAG_RD,
+		   &rs->rs_disable, 0,
+		   "Disable this interface from new hdwr limiting?");
+	} else {
+		SYSCTL_ADD_S32(&rs->sysctl_ctx,
+		   SYSCTL_CHILDREN(rl_sysctl_root),
+		   OID_AUTO, "disable", CTLFLAG_RW,
+		   &rs->rs_disable, 0,
+		   "Disable this interface from new hdwr limiting?");
+	}
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "minseg", CTLFLAG_RW,
+	    &rs->rs_min_seg, 0,
+	    "What is the minimum we need to send on this interface?");
+	SYSCTL_ADD_U64(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "flow_limit", CTLFLAG_RW,
+	    &rs->rs_flow_limit, 0,
+	    "What is the limit for number of flows (0=unlimited)?");
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "highest", CTLFLAG_RD,
+	    &rs->rs_highest_valid, 0,
+	    "Highest valid rate");
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "lowest", CTLFLAG_RD,
+	    &rs->rs_lowest_valid, 0,
+	    "Lowest valid rate");
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "flags", CTLFLAG_RD,
+	    &rs->rs_flags, 0,
+	    "What flags are on the entry?");
+	SYSCTL_ADD_S32(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "numrates", CTLFLAG_RD,
+	    &rs->rs_rate_cnt, 0,
+	    "How many rates are there?");
+	SYSCTL_ADD_U64(&rs->sysctl_ctx,
+	    SYSCTL_CHILDREN(rl_sysctl_root),
+	    OID_AUTO, "flows_using", CTLFLAG_RD,
+	    &rs->rs_flows_using, 0,
+	    "How many flows are using this interface now?");
+#ifdef DETAILED_RATELIMIT_SYSCTL
+	if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
+		/* Let's display the rates */
+		int i;
+		struct sysctl_oid *rl_rates;
+		struct sysctl_oid *rl_rate_num;
+		char rate_num[16];
+		rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+					    SYSCTL_CHILDREN(rl_sysctl_root),
+					    OID_AUTO,
+					    "rate",
+					    CTLFLAG_RW, 0,
+					    "Ratelist");
+		for( i = 0; i < rs->rs_rate_cnt; i++) {
+			sprintf(rate_num, "%d", i);
+			rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+					    SYSCTL_CHILDREN(rl_rates),
+					    OID_AUTO,
+					    rate_num,
+					    CTLFLAG_RW, 0,
+					    "Individual Rate");
+			SYSCTL_ADD_U32(&rs->sysctl_ctx,
+				       SYSCTL_CHILDREN(rl_rate_num),
+				       OID_AUTO, "flags", CTLFLAG_RD,
+				       &rs->rs_rlt[i].flags, 0,
+				       "Flags on this rate");
+			SYSCTL_ADD_U32(&rs->sysctl_ctx,
+				       SYSCTL_CHILDREN(rl_rate_num),
+				       OID_AUTO, "pacetime", CTLFLAG_RD,
+				       &rs->rs_rlt[i].time_between, 0,
+				       "Time hardware inserts between 1500 byte sends");
+			SYSCTL_ADD_U64(&rs->sysctl_ctx,
+				       SYSCTL_CHILDREN(rl_rate_num),
+				       OID_AUTO, "rate", CTLFLAG_RD,
+				       &rs->rs_rlt[i].rate, 0,
+				       "Rate in bytes per second");
+		}
+	}
+#endif
+}
+
+static void
+rs_destroy(epoch_context_t ctx)
+{
+	struct tcp_rate_set *rs;
+
+	rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
+	mtx_lock(&rs_mtx);
+	rs->rs_flags &= ~RS_FUNERAL_SCHD;
+	if (rs->rs_flows_using == 0) {
+		/*
+		 * In theory it's possible (but unlikely)
+		 * that while the delete was occurring
+		 * and we were applying the DEAD flag,
+		 * someone slipped in and found the
+		 * interface in a lookup. While we
+		 * decided rs_flows_using was 0 and were
+		 * scheduling the epoch_call, the other
+		 * thread incremented rs_flows_using. This
+		 * is because users have a pointer and
+		 * we only use rs_flows_using in an
+		 * atomic fashion, i.e. the other entities
+		 * are not protected. To ensure this did
+		 * not occur, we check rs_flows_using here
+		 * before deleting.
+		 */
+		sysctl_ctx_free(&rs->sysctl_ctx);
+		free(rs->rs_rlt, M_TCPPACE);
+		free(rs, M_TCPPACE);
+		rs_number_dead--;
+	}
+	mtx_unlock(&rs_mtx);
+
+}
+
+extern counter_u64_t rate_limit_set_ok;
+extern counter_u64_t rate_limit_active;
+extern counter_u64_t rate_limit_alloc_fail;
+
+static int
+rl_attach_txrtlmt(struct ifnet *ifp,
+    uint32_t flowtype,
+    int flowid,
+    uint64_t cfg_rate,
+    struct m_snd_tag **tag)
+{
+	int error;
+	union if_snd_tag_alloc_params params = {
+		.rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+		.rate_limit.hdr.flowid = flowid,
+		.rate_limit.hdr.flowtype = flowtype,
+		.rate_limit.max_rate = cfg_rate,
+		.rate_limit.flags = M_NOWAIT,
+	};
+
+	if (ifp->if_snd_tag_alloc == NULL) {
+		error = EOPNOTSUPP;
+	} else {
+		error = ifp->if_snd_tag_alloc(ifp, &params, tag);
+		if (error == 0) {
+			if_ref((*tag)->ifp);
+			counter_u64_add(rate_limit_set_ok, 1);
+			counter_u64_add(rate_limit_active, 1);
+		} else
+			counter_u64_add(rate_limit_alloc_fail, 1);
+	}
+	return (error);
+}
+
+static void
+populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
+{
+	/*
+	 * The internal table is "special": it
+	 * is two separate ordered tables that
+	 * must be merged. We get here when the
+	 * adapter specifies a number of rates that
+	 * covers both ranges in the table in some
+	 * form.
+	 */
+	int i, at_low, at_high;
+	uint8_t low_disabled = 0, high_disabled = 0;
+
+	for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
+		rs->rs_rlt[i].flags = 0;
+		rs->rs_rlt[i].time_between = 0;
+		if ((low_disabled == 0) &&
+		    (high_disabled ||
+		     (rate_table_act[at_low] < rate_table_act[at_high]))) {
+			rs->rs_rlt[i].rate = rate_table_act[at_low];
+			at_low++;
+			if (at_low == RS_NEXT_ORDER_GROUP)
+				low_disabled = 1;
+		} else if (high_disabled == 0) {
+			rs->rs_rlt[i].rate = rate_table_act[at_high];
+			at_high++;
+			if (at_high == MAX_HDWR_RATES)
+				high_disabled = 1;
+		}
+	}
+}
+
+static struct tcp_rate_set *
+rt_setup_new_rs(struct ifnet *ifp, int *error)
+{
+	struct tcp_rate_set *rs;
+	const uint64_t *rate_table_act;
+	uint64_t lentim, res;
+	size_t sz;
+	uint32_t hash_type;
+	int i;
+	struct if_ratelimit_query_results rl;
+	struct sysctl_oid *rl_sysctl_root;
+	/*
+	 * We expect to enter with the 
+	 * mutex locked.
+	 */
+
+	if (ifp->if_ratelimit_query == NULL) {
+		/*
+		 * We can do nothing if we cannot
+		 * get a query back from the driver.
+		 */
+		return (NULL);
+	}
+	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
+	if (rs == NULL) {
+		if (error)
+			*error = ENOMEM;
+		return (NULL);
+	}
+	rl.flags = RT_NOSUPPORT;
+	ifp->if_ratelimit_query(ifp, &rl);
+	if (rl.flags & RT_IS_UNUSABLE) {
+		/*
+		 * The interface does not actually
+		 * support rate limiting.
+		 */
+		memset(rs, 0, sizeof(struct tcp_rate_set));
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_flags = RS_INTF_NO_SUP;
+		rs->rs_disable = 1;
+		rs_number_alive++;
+		sysctl_ctx_init(&rs->sysctl_ctx);
+		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+		    OID_AUTO,
+		    rs->rs_ifp->if_xname,
+		    CTLFLAG_RW, 0,
+		    "");
+		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+		/* Unlock to allow the sysctl stuff to allocate */
+		mtx_unlock(&rs_mtx);
+		rl_add_syctl_entries(rl_sysctl_root, rs);
+		/* re-lock for our caller */
+		mtx_lock(&rs_mtx);
+		return (rs);
+	} else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
+		memset(rs, 0, sizeof(struct tcp_rate_set));
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_flags = RS_IS_DEFF;
+		rs_number_alive++;
+		sysctl_ctx_init(&rs->sysctl_ctx);
+		rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+		    SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+		    OID_AUTO,
+		    rs->rs_ifp->if_xname,
+		    CTLFLAG_RW, 0,
+		    "");
+		CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+		/* Unlock to allow the sysctl stuff to allocate */
+		mtx_unlock(&rs_mtx);
+		rl_add_syctl_entries(rl_sysctl_root, rs);
+		/* re-lock for our caller */
+		mtx_lock(&rs_mtx);
+		return (rs);
+	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
+		/* Mellanox most likely */
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_rate_cnt = rl.number_of_rates;
+		rs->rs_min_seg = rl.min_segment_burst;
+		rs->rs_highest_valid = 0;
+		rs->rs_flow_limit = rl.max_flows;
+		rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
+		rs->rs_disable = 0;
+		rate_table_act = rl.rate_table;
+	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
+		/* Chelsio */
+		rs->rs_ifp = ifp;
+		rs->rs_if_dunit = ifp->if_dunit;
+		rs->rs_rate_cnt = rl.number_of_rates;
+		rs->rs_min_seg = rl.min_segment_burst;
+		rs->rs_disable = 0;
+		rs->rs_flow_limit = rl.max_flows;
+		rate_table_act = desired_rates;
+		if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
+		    (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
+			/*
+			 * Our desired table is not big
+			 * enough, do what we can.
+			 */
+			rs->rs_rate_cnt = MAX_HDWR_RATES;
+		 }
+		if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
+			rs->rs_flags = RS_IS_INTF;
+		else
+			rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
+		if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
+			rs->rs_rate_cnt = ALL_HARDWARE_RATES;
+	} else {
+		printf("Interface:%s unit:%d not one known to have rate-limits\n",
+		    ifp->if_dname,
+		    ifp->if_dunit);
+		free(rs, M_TCPPACE);
+		return (NULL);
+	}
+	sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
+	rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
+	if (rs->rs_rlt == NULL) {
+		if (error)
+			*error = ENOMEM;
+bail:
+		free(rs, M_TCPPACE);
+		return (NULL);
+	}
+	if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
+		/*
+		 * The interface supports all
+		 * the rates we could possibly want.
+		 */
+		uint64_t rat;
+
+		rs->rs_rlt[0].rate = 12500;	/* 100k */
+		rs->rs_rlt[1].rate = 25000;	/* 200k */
+		rs->rs_rlt[2].rate = 62500;	/* 500k */
+		/* Note 125000 == 1Megabit
+		 * populate 1Meg - 1000meg.
+		 */
+		for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
+			rs->rs_rlt[i].rate = rat;
+			rat += 125000;
+		}
+		rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
+	} else if (rs->rs_flags & RS_INT_TBL) {
+		/* We populate this in a special way */
+		populate_canned_table(rs, rate_table_act);
+	} else {
+		/*

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


