From owner-svn-src-all@freebsd.org Mon Aug 3 12:13:58 2015 Return-Path: Delivered-To: svn-src-all@mailman.ysv.freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:1900:2254:206a::19:1]) by mailman.ysv.freebsd.org (Postfix) with ESMTP id 847119B1A7A; Mon, 3 Aug 2015 12:13:58 +0000 (UTC) (envelope-from jch@FreeBSD.org) Received: from repo.freebsd.org (repo.freebsd.org [IPv6:2001:1900:2254:2068::e6a:0]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (Client did not present a certificate) by mx1.freebsd.org (Postfix) with ESMTPS id 742913B2; Mon, 3 Aug 2015 12:13:58 +0000 (UTC) (envelope-from jch@FreeBSD.org) Received: from repo.freebsd.org ([127.0.1.70]) by repo.freebsd.org (8.14.9/8.14.9) with ESMTP id t73CDw9F090939; Mon, 3 Aug 2015 12:13:58 GMT (envelope-from jch@FreeBSD.org) Received: (from jch@localhost) by repo.freebsd.org (8.14.9/8.14.9/Submit) id t73CDtrg090930; Mon, 3 Aug 2015 12:13:55 GMT (envelope-from jch@FreeBSD.org) Message-Id: <201508031213.t73CDtrg090930@repo.freebsd.org> X-Authentication-Warning: repo.freebsd.org: jch set sender to jch@FreeBSD.org using -f From: Julien Charbon Date: Mon, 3 Aug 2015 12:13:55 +0000 (UTC) To: src-committers@freebsd.org, svn-src-all@freebsd.org, svn-src-head@freebsd.org Subject: svn commit: r286227 - in head/sys: dev/cxgb/ulp/tom dev/cxgbe/tom netinet netinet6 X-SVN-Group: head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit X-BeenThere: svn-src-all@freebsd.org X-Mailman-Version: 2.1.20 Precedence: list List-Id: "SVN commit messages for the entire src tree \(except for " user" and " projects" \)" List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 03 Aug 2015 12:13:58 -0000 Author: jch Date: Mon Aug 3 12:13:54 2015 New Revision: 286227 URL: https://svnweb.freebsd.org/changeset/base/286227 Log: Decompose TCP INP_INFO lock to increase short-lived TCP connections scalability: - The existing TCP INP_INFO lock continues to protect the global inpcb list stability during full list traversal (e.g. tcp_pcblist()). - A new INP_LIST lock protects inpcb list actual modifications (inp allocation and free) and inpcb global counters. It allows to use TCP INP_INFO_RLOCK lock in critical paths (e.g. tcp_input()) and INP_INFO_WLOCK only in occasional operations that walk all connections. PR: 183659 Differential Revision: https://reviews.freebsd.org/D2599 Reviewed by: jhb, adrian Tested by: adrian, nitroboost-gmail.com Sponsored by: Verisign, Inc. Modified: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c head/sys/dev/cxgb/ulp/tom/cxgb_listen.c head/sys/dev/cxgbe/tom/t4_connect.c head/sys/dev/cxgbe/tom/t4_cpl_io.c head/sys/dev/cxgbe/tom/t4_listen.c head/sys/netinet/in_pcb.c head/sys/netinet/in_pcb.h head/sys/netinet/tcp_input.c head/sys/netinet/tcp_subr.c head/sys/netinet/tcp_syncache.c head/sys/netinet/tcp_timer.c head/sys/netinet/tcp_timewait.c head/sys/netinet/tcp_usrreq.c head/sys/netinet/toecore.c head/sys/netinet6/in6_pcb.c Modified: head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c ============================================================================== --- head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c Mon Aug 3 11:57:11 2015 (r286226) +++ head/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c Mon Aug 3 12:13:54 2015 (r286227) @@ -639,7 +639,7 @@ t3_send_fin(struct toedev *tod, struct t unsigned int tid = toep->tp_tid; #endif - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p, flags %x", __func__, tid, toep, @@ -925,12 +925,12 @@ do_act_open_rpl(struct sge_qset *qs, str rc = act_open_rpl_status_to_errno(s); if (rc != EAGAIN) - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); toe_connect_failed(tod, inp, rc); toepcb_release(toep); /* unlocks inp */ if (rc != EAGAIN) - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); @@ -1061,7 +1061,7 @@ send_reset(struct toepcb *toep) struct adapter *sc = tod->tod_softc; struct mbuf *m; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); CTR4(KTR_CXGB, "%s: tid %d, toep %p (%x)", __func__, tid, toep, @@ -1172,12 +1172,12 @@ do_rx_data(struct sge_qset *qs, struct r SOCKBUF_UNLOCK(so_rcv); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); @@ -1222,7 +1222,7 @@ do_peer_close(struct sge_qset *qs, struc struct tcpcb *tp; struct socket *so; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1250,7 +1250,7 @@ do_peer_close(struct sge_qset *qs, struc case TCPS_FIN_WAIT_2: tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ @@ -1264,7 +1264,7 @@ do_peer_close(struct sge_qset *qs, struc done: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); @@ -1285,7 +1285,7 @@ do_close_con_rpl(struct sge_qset *qs, st struct tcpcb *tp; struct socket *so; - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1303,7 +1303,7 @@ do_close_con_rpl(struct sge_qset *qs, st tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); toepcb_release(toep); /* no more CPLs expected */ @@ -1328,7 +1328,7 @@ release: done: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); @@ -1489,7 +1489,7 @@ do_abort_req(struct sge_qset *qs, struct return (do_abort_req_synqe(qs, r, m)); inp = toep->tp_inp; - INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */ + INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1503,7 +1503,7 @@ do_abort_req(struct sge_qset *qs, struct toep->tp_flags |= TP_ABORT_REQ_RCVD; toep->tp_flags |= TP_ABORT_SHUTDOWN; INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } @@ -1523,7 +1523,7 @@ do_abort_req(struct sge_qset *qs, struct INP_WLOCK(inp); /* re-acquire */ toepcb_release(toep); /* no more CPLs expected */ } - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); send_abort_rpl(tod, tid, qset); m_freem(m); Modified: head/sys/dev/cxgb/ulp/tom/cxgb_listen.c ============================================================================== --- head/sys/dev/cxgb/ulp/tom/cxgb_listen.c Mon Aug 3 11:57:11 2015 (r286226) +++ head/sys/dev/cxgb/ulp/tom/cxgb_listen.c Mon Aug 3 12:13:54 2015 (r286227) @@ -541,11 +541,11 @@ do_pass_accept_req(struct sge_qset *qs, REJECT_PASS_ACCEPT(); /* no l2te, or ifp mismatch */ } - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); REJECT_PASS_ACCEPT(); } @@ -558,7 +558,7 @@ do_pass_accept_req(struct sge_qset *qs, * resources tied to this listen context. */ INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); REJECT_PASS_ACCEPT(); } so = inp->inp_socket; @@ -686,7 +686,7 @@ do_pass_establish(struct sge_qset *qs, s struct toepcb *toep; struct socket *so; struct listen_ctx *lctx = synqe->lctx; - struct inpcb *inp = lctx->inp; + struct inpcb *inp = lctx->inp, *new_inp; struct tcpopt to; struct tcphdr th; struct in_conninfo inc; @@ -700,7 +700,7 @@ do_pass_establish(struct sge_qset *qs, s KASSERT(qs->idx == synqe->qset, ("%s qset mismatch %d %d", __func__, qs->idx, synqe->qset)); - INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */ + INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */ INP_WLOCK(inp); if (__predict_false(inp->inp_flags & INP_DROPPED)) { @@ -714,7 +714,7 @@ do_pass_establish(struct sge_qset *qs, s ("%s: listen socket dropped but tid %u not aborted.", __func__, tid)); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } @@ -730,7 +730,7 @@ do_pass_establish(struct sge_qset *qs, s reset: t3_send_reset_synqe(tod, synqe); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); m_freem(m); return (0); } @@ -748,21 +748,23 @@ reset: goto reset; } - if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) { - struct inpcb *new_inp = sotoinpcb(so); + /* New connection inpcb is already locked by syncache_expand(). */ + new_inp = sotoinpcb(so); + INP_WLOCK_ASSERT(new_inp); - INP_WLOCK(new_inp); + if (__predict_false(!(synqe->flags & TP_SYNQE_EXPANDED))) { tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); t3_offload_socket(tod, synqe, so); - INP_WUNLOCK(new_inp); } + INP_WUNLOCK(new_inp); + /* Remove the synq entry and release its reference on the lctx */ TAILQ_REMOVE(&lctx->synq, synqe, link); inp = release_lctx(td, lctx); if (inp) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); release_synqe(synqe); m_freem(m); @@ -1128,7 +1130,7 @@ t3_offload_socket(struct toedev *tod, vo struct cpl_pass_establish *cpl = synqe->cpl; struct toepcb *toep = synqe->toep; - INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); offload_socket(so, toep); Modified: head/sys/dev/cxgbe/tom/t4_connect.c ============================================================================== --- head/sys/dev/cxgbe/tom/t4_connect.c Mon Aug 3 11:57:11 2015 (r286226) +++ head/sys/dev/cxgbe/tom/t4_connect.c Mon Aug 3 12:13:54 2015 (r286227) @@ -189,12 +189,12 @@ act_open_failure_cleanup(struct adapter toep->tid = -1; if (status != EAGAIN) - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); toe_connect_failed(tod, inp, status); final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); } static int Modified: head/sys/dev/cxgbe/tom/t4_cpl_io.c ============================================================================== --- head/sys/dev/cxgbe/tom/t4_cpl_io.c Mon Aug 3 11:57:11 2015 (r286226) +++ head/sys/dev/cxgbe/tom/t4_cpl_io.c Mon Aug 3 12:13:54 2015 (r286227) @@ -1085,7 +1085,7 @@ do_peer_close(struct sge_iq *iq, const s KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1127,7 +1127,7 @@ do_peer_close(struct sge_iq *iq, const s case TCPS_FIN_WAIT_2: tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); final_cpl_received(toep); @@ -1139,7 +1139,7 @@ do_peer_close(struct sge_iq *iq, const s } done: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } @@ -1166,7 +1166,7 @@ do_close_con_rpl(struct sge_iq *iq, cons KASSERT(m == NULL, ("%s: wasn't expecting payload", __func__)); KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1184,7 +1184,7 @@ do_close_con_rpl(struct sge_iq *iq, cons tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); INP_WLOCK(inp); final_cpl_received(toep); /* no more CPLs expected */ @@ -1208,7 +1208,7 @@ release: } done: INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } @@ -1367,7 +1367,7 @@ do_abort_req(struct sge_iq *iq, const st } inp = toep->inp; - INP_INFO_WLOCK(&V_tcbinfo); /* for tcp_close */ + INP_INFO_RLOCK(&V_tcbinfo); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1401,7 +1401,7 @@ do_abort_req(struct sge_iq *iq, const st final_cpl_received(toep); done: - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); } @@ -1515,12 +1515,12 @@ do_rx_data(struct sge_iq *iq, const stru SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); + INP_INFO_RLOCK(&V_tcbinfo); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } Modified: head/sys/dev/cxgbe/tom/t4_listen.c ============================================================================== --- head/sys/dev/cxgbe/tom/t4_listen.c Mon Aug 3 11:57:11 2015 (r286226) +++ head/sys/dev/cxgbe/tom/t4_listen.c Mon Aug 3 12:13:54 2015 (r286227) @@ -930,7 +930,7 @@ t4_offload_socket(struct toedev *tod, vo struct cpl_pass_establish *cpl = mtod(synqe->syn, void *); struct toepcb *toep = *(struct toepcb **)(cpl + 1); - INP_INFO_LOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); KASSERT(synqe->flags & TPF_SYNQE, ("%s: %p not a synq_entry?", __func__, arg)); @@ -1259,15 +1259,15 @@ do_pass_accept_req(struct sge_iq *iq, co REJECT_PASS_ACCEPT(); rpl = wrtod(wr); - INP_INFO_WLOCK(&V_tcbinfo); /* for 4-tuple check */ + INP_INFO_RLOCK(&V_tcbinfo); /* for 4-tuple check */ /* Don't offload if the 4-tuple is already in use */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); free(wr, M_CXGBE); REJECT_PASS_ACCEPT(); } - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); inp = lctx->inp; /* listening socket, not owned by TOE */ INP_WLOCK(inp); @@ -1441,7 +1441,7 @@ do_pass_establish(struct sge_iq *iq, con unsigned int tid = GET_TID(cpl); struct synq_entry *synqe = lookup_tid(sc, tid); struct listen_ctx *lctx = synqe->lctx; - struct inpcb *inp = lctx->inp; + struct inpcb *inp = lctx->inp, *new_inp; struct socket *so; struct tcphdr th; struct tcpopt to; @@ -1459,7 +1459,7 @@ do_pass_establish(struct sge_iq *iq, con KASSERT(synqe->flags & TPF_SYNQE, ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); - INP_INFO_WLOCK(&V_tcbinfo); /* for syncache_expand */ + INP_INFO_RLOCK(&V_tcbinfo); /* for syncache_expand */ INP_WLOCK(inp); CTR6(KTR_CXGBE, @@ -1475,7 +1475,7 @@ do_pass_establish(struct sge_iq *iq, con } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } @@ -1500,7 +1500,7 @@ reset: */ send_reset_synqe(TOEDEV(ifp), synqe); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (0); } toep->tid = tid; @@ -1534,6 +1534,10 @@ reset: goto reset; } + /* New connection inpcb is already locked by syncache_expand(). */ + new_inp = sotoinpcb(so); + INP_WLOCK_ASSERT(new_inp); + /* * This is for the unlikely case where the syncache entry that we added * has been evicted from the syncache, but the syncache_expand above @@ -1544,20 +1548,18 @@ reset: * this somewhat defeats the purpose of having a tod_offload_socket :-( */ if (__predict_false(!(synqe->flags & TPF_SYNQE_EXPANDED))) { - struct inpcb *new_inp = sotoinpcb(so); - - INP_WLOCK(new_inp); tcp_timer_activate(intotcpcb(new_inp), TT_KEEP, 0); t4_offload_socket(TOEDEV(ifp), synqe, so); - INP_WUNLOCK(new_inp); } + INP_WUNLOCK(new_inp); + /* Done with the synqe */ TAILQ_REMOVE(&lctx->synq, synqe, link); inp = release_lctx(sc, lctx); if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); release_synqe(synqe); return (0); Modified: head/sys/netinet/in_pcb.c ============================================================================== --- head/sys/netinet/in_pcb.c Mon Aug 3 11:57:11 2015 (r286226) +++ head/sys/netinet/in_pcb.c Mon Aug 3 12:13:54 2015 (r286227) @@ -224,6 +224,7 @@ in_pcbinfo_init(struct inpcbinfo *pcbinf INP_INFO_LOCK_INIT(pcbinfo, name); INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ + INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif @@ -262,6 +263,7 @@ in_pcbinfo_destroy(struct inpcbinfo *pcb in_pcbgroup_destroy(pcbinfo); #endif uma_zdestroy(pcbinfo->ipi_zone); + INP_LIST_LOCK_DESTROY(pcbinfo); INP_HASH_LOCK_DESTROY(pcbinfo); INP_INFO_LOCK_DESTROY(pcbinfo); } @@ -276,7 +278,14 @@ in_pcballoc(struct socket *so, struct in struct inpcb *inp; int error; - INP_INFO_WLOCK_ASSERT(pcbinfo); +#ifdef INVARIANTS + if (pcbinfo == &V_tcbinfo) { + INP_INFO_RLOCK_ASSERT(pcbinfo); + } else { + INP_INFO_WLOCK_ASSERT(pcbinfo); + } +#endif + error = 0; inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) @@ -308,6 +317,8 @@ in_pcballoc(struct socket *so, struct in inp->inp_flags |= IN6P_IPV6_V6ONLY; } #endif + INP_WLOCK(inp); + INP_LIST_WLOCK(pcbinfo); LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); pcbinfo->ipi_count++; so->so_pcb = (caddr_t)inp; @@ -315,9 +326,9 @@ in_pcballoc(struct socket *so, struct in if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif - INP_WLOCK(inp); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ + INP_LIST_WUNLOCK(pcbinfo); #if defined(IPSEC) || defined(MAC) out: if (error != 0) { @@ -1246,7 +1257,13 @@ in_pcbfree(struct inpcb *inp) KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - INP_INFO_WLOCK_ASSERT(pcbinfo); +#ifdef INVARIANTS + if (pcbinfo == &V_tcbinfo) { + INP_INFO_RLOCK_ASSERT(pcbinfo); + } else { + INP_INFO_WLOCK_ASSERT(pcbinfo); + } +#endif INP_WLOCK_ASSERT(inp); /* XXXRW: Do as much as possible here. */ @@ -1254,8 +1271,10 @@ in_pcbfree(struct inpcb *inp) if (inp->inp_sp != NULL) ipsec_delete_pcbpolicy(inp); #endif + INP_LIST_WLOCK(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; in_pcbremlists(inp); + INP_LIST_WUNLOCK(pcbinfo); #ifdef INET6 if (inp->inp_vflag & INP_IPV6PROTO) { ip6_freepcbopts(inp->in6p_outputopts); @@ -1412,7 +1431,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo struct ip_moptions *imo; int i, gap; - INP_INFO_RLOCK(pcbinfo); + INP_INFO_WLOCK(pcbinfo); LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { INP_WLOCK(inp); imo = inp->inp_moptions; @@ -1442,7 +1461,7 @@ in_pcbpurgeif0(struct inpcbinfo *pcbinfo } INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(pcbinfo); + INP_INFO_WUNLOCK(pcbinfo); } /* @@ -2163,8 +2182,16 @@ in_pcbremlists(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - INP_INFO_WLOCK_ASSERT(pcbinfo); +#ifdef INVARIANTS + if (pcbinfo == &V_tcbinfo) { + INP_INFO_RLOCK_ASSERT(pcbinfo); + } else { + INP_INFO_WLOCK_ASSERT(pcbinfo); + } +#endif + INP_WLOCK_ASSERT(inp); + INP_LIST_WLOCK_ASSERT(pcbinfo); inp->inp_gencnt = ++pcbinfo->ipi_gencnt; if (inp->inp_flags & INP_INHASHLIST) { @@ -2309,13 +2336,13 @@ inp_apply_all(void (*func)(struct inpcb { struct inpcb *inp; - INP_INFO_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { INP_WLOCK(inp); func(inp, arg); INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * Modified: head/sys/netinet/in_pcb.h ============================================================================== --- head/sys/netinet/in_pcb.h Mon Aug 3 11:57:11 2015 (r286226) +++ head/sys/netinet/in_pcb.h Mon Aug 3 12:13:54 2015 (r286227) @@ -130,23 +130,35 @@ struct in_conninfo { struct icmp6_filter; /*- - * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 - * and IPv6 sockets. In the case of TCP, further per-connection state is + * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and + * IPv6 sockets. In the case of TCP and UDP, further per-connection state is * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb * are static after creation or protected by a per-inpcb rwlock, inp_lock. A - * few fields also require the global pcbinfo lock for the inpcb to be held, - * when modified, such as the global connection lists and hashes, as well as - * binding information (which affects which hash a connection is on). This - * model means that connections can be looked up without holding the - * per-connection lock, which is important for performance when attempting to - * find the connection for a packet given its IP and port tuple. Writing to - * these fields that write locks be held on both the inpcb and global locks. + * few fields are protected by multiple locks as indicated in the locking notes + * below. For these fields, all of the listed locks must be write-locked for + * any modifications. However, these fields can be safely read while any one of + * the listed locks are read-locked. This model can permit greater concurrency + * for read operations. For example, connections can be looked up while only + * holding a read lock on the global pcblist lock. This is important for + * performance when attempting to find the connection for a packet given its IP + * and port tuple. + * + * One noteworthy exception is that the global pcbinfo lock follows a different + * set of rules in relation to the inp_list field. Rather than being + * write-locked for modifications and read-locked for list iterations, it must + * be read-locked during modifications and write-locked during list iterations. + * This ensures that the relatively rare global list iterations safely walk a + * stable snapshot of connections while allowing more common list modifications + * to safely grab the pcblist lock just while adding or removing a connection + * from the global list. * * Key: * (c) - Constant after initialization * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb + * (l) - Protected by the pcblist lock for the inpcb + * (h) - Protected by the pcbhash lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking * @@ -161,15 +173,21 @@ struct icmp6_filter; * socket has been freed), or there may be close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). + * + * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock + * read-lock usage during modification, this model can be applied to other + * protocols (especially SCTP). */ struct inpcb { - LIST_ENTRY(inpcb) inp_hash; /* (i/p) hash list */ + LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */ LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ - LIST_ENTRY(inpcb) inp_list; /* (i/p) list for all PCBs for proto */ + LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ + /* (p[w]) for list iteration */ + /* (p[r]/l) for addition/removal */ void *inp_ppcb; /* (i) pointer to per-protocol pcb */ struct inpcbinfo *inp_pcbinfo; /* (c) PCB list info */ struct inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */ - LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/p) group wildcard entry */ + LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */ struct socket *inp_socket; /* (i) back pointer to socket */ struct ucred *inp_cred; /* (c) cache of socket cred */ u_int32_t inp_flow; /* (i) IPv6 flow information */ @@ -188,7 +206,7 @@ struct inpcb { * general use */ /* Local and foreign ports, local and foreign addr. */ - struct in_conninfo inp_inc; /* (i/p) list for PCB's local port */ + struct in_conninfo inp_inc; /* (i) list for PCB's local port */ /* MAC and IPSEC policy information. */ struct label *inp_label; /* (i) MAC label */ @@ -213,8 +231,8 @@ struct inpcb { int inp6_cksum; short inp6_hops; } inp_depend6; - LIST_ENTRY(inpcb) inp_portlist; /* (i/p) */ - struct inpcbport *inp_phd; /* (i/p) head of this list */ + LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */ + struct inpcbport *inp_phd; /* (i/h) head of this list */ #define inp_zero_size offsetof(struct inpcb, inp_gencnt) inp_gen_t inp_gencnt; /* (c) generation count */ struct llentry *inp_lle; /* cached L2 information */ @@ -279,37 +297,46 @@ struct inpcbport { * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. * - * Each pcbinfo is protected by two locks: ipi_lock and ipi_hash_lock, - * the former covering mutable global fields (such as the global pcb list), - * and the latter covering the hashed lookup tables. The lock order is: + * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and + * ipi_list_lock: + * - ipi_lock covering the global pcb list stability during loop iteration, + * - ipi_hash_lock covering the hashed lookup tables, + * - ipi_list_lock covering mutable global fields (such as the global + * pcb list) + * + * The lock order is: * - * ipi_lock (before) inpcb locks (before) {ipi_hash_lock, pcbgroup locks} + * ipi_lock (before) + * inpcb locks (before) + * ipi_list locks (before) + * {ipi_hash_lock, pcbgroup locks} * * Locking key: * * (c) Constant or nearly constant after initialisation * (g) Locked by ipi_lock + * (l) Locked by ipi_list_lock * (h) Read using either ipi_hash_lock or inpcb lock; write requires both * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* - * Global lock protecting global inpcb list, inpcb count, etc. + * Global lock protecting full inpcb list traversal */ struct rwlock ipi_lock; /* * Global list of inpcbs on the protocol. */ - struct inpcbhead *ipi_listhead; /* (g) */ - u_int ipi_count; /* (g) */ + struct inpcbhead *ipi_listhead; /* (g/l) */ + u_int ipi_count; /* (l) */ /* * Generation count -- incremented each time a connection is allocated * or freed. */ - u_quad_t ipi_gencnt; /* (g) */ + u_quad_t ipi_gencnt; /* (l) */ /* * Fields associated with port lookup and allocation. @@ -367,6 +394,11 @@ struct inpcbinfo { * general use 2 */ void *ipi_pspare[2]; + + /* + * Global lock protecting global inpcb list, inpcb count, etc. + */ + struct rwlock ipi_list_lock; }; #ifdef _KERNEL @@ -466,6 +498,25 @@ short inp_so_options(const struct inpcb #define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) #define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) +#define INP_LIST_LOCK_INIT(ipi, d) \ + rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) +#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock) +#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock) +#define INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock) +#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock) +#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock) +#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock) +#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock) +#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock) +#define INP_LIST_LOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED) +#define INP_LIST_RLOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED) +#define INP_LIST_WLOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED) +#define INP_LIST_UNLOCK_ASSERT(ipi) \ + rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) + #define INP_HASH_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) #define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) Modified: head/sys/netinet/tcp_input.c ============================================================================== --- head/sys/netinet/tcp_input.c Mon Aug 3 11:57:11 2015 (r286226) +++ head/sys/netinet/tcp_input.c Mon Aug 3 12:13:54 2015 (r286227) @@ -608,7 +608,7 @@ tcp_input(struct mbuf **mp, int *offp, i char *s = NULL; /* address and port logging */ int ti_locked; #define TI_UNLOCKED 1 -#define TI_WLOCKED 2 +#define TI_RLOCKED 2 #ifdef TCPDEBUG /* @@ -797,8 +797,8 @@ tcp_input(struct mbuf **mp, int *offp, i * connection in TIMEWAIT and SYNs not targeting a listening socket. */ if ((thflags & (TH_FIN | TH_RST)) != 0) { - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; } else ti_locked = TI_UNLOCKED; @@ -820,8 +820,8 @@ tcp_input(struct mbuf **mp, int *offp, i findpcb: #ifdef INVARIANTS - if (ti_locked == TI_WLOCKED) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } @@ -969,20 +969,20 @@ findpcb: relocked: if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { + if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; goto findpcb; } } else - ti_locked = TI_WLOCKED; + ti_locked = TI_RLOCKED; } - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); @@ -991,7 +991,7 @@ relocked: */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_WUNLOCK(&V_tcbinfo); + INP_INFO_RUNLOCK(&V_tcbinfo); return (IPPROTO_DONE); } /* @@ -1022,16 +1022,16 @@ relocked: */ #ifdef INVARIANTS if ((thflags & (TH_FIN | TH_RST)) != 0) - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); #endif if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) || (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN)))) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_WLOCK(&V_tcbinfo) == 0) { + if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { in_pcbref(inp); INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - ti_locked = TI_WLOCKED; + INP_INFO_RLOCK(&V_tcbinfo); + ti_locked = TI_RLOCKED; INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { inp = NULL; @@ -1039,9 +1039,9 @@ relocked: } goto relocked; } else - ti_locked = TI_WLOCKED; + ti_locked = TI_RLOCKED; } - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } #ifdef MAC @@ -1096,7 +1096,7 @@ relocked: */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Parse the TCP options here because * syncookies need access to the reflected @@ -1148,7 +1148,11 @@ relocked: */ INP_WUNLOCK(inp); /* listen socket */ inp = sotoinpcb(so); - INP_WLOCK(inp); /* new connection */ + /* + * New connection inpcb is already locked by + * syncache_expand(). + */ + INP_WLOCK_ASSERT(inp); tp = intotcpcb(inp); KASSERT(tp->t_state == TCPS_SYN_RECEIVED, ("%s: ", __func__)); @@ -1379,8 +1383,8 @@ relocked: * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); @@ -1429,8 +1433,8 @@ relocked: dropwithreset: TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1453,8 +1457,8 @@ dropunlock: if (m != NULL) TCP_PROBE5(receive, NULL, tp, mtod(m, const char *), tp, th); - if (ti_locked == TI_WLOCKED) { - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) { + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; } #ifdef INVARIANTS @@ -1511,13 +1515,13 @@ tcp_do_segment(struct mbuf *m, struct tc */ if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || tp->t_state != TCPS_ESTABLISHED) { - KASSERT(ti_locked == TI_WLOCKED, ("%s ti_locked %d for " + KASSERT(ti_locked == TI_RLOCKED, ("%s ti_locked %d for " "SYN/FIN/RST/!EST", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } else { #ifdef INVARIANTS - if (ti_locked == TI_WLOCKED) - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); else { KASSERT(ti_locked == TI_UNLOCKED, ("%s: EST " "ti_locked: %d", __func__, ti_locked)); @@ -1690,8 +1694,8 @@ tcp_do_segment(struct mbuf *m, struct tc /* * This is a pure ack for outstanding data. */ - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; TCPSTAT_INC(tcps_predack); @@ -1794,8 +1798,8 @@ tcp_do_segment(struct mbuf *m, struct tc * nothing on the reassembly queue and we have enough * buffer space to take it. */ - if (ti_locked == TI_WLOCKED) - INP_INFO_WUNLOCK(&V_tcbinfo); + if (ti_locked == TI_RLOCKED) + INP_INFO_RUNLOCK(&V_tcbinfo); ti_locked = TI_UNLOCKED; /* Clean receiver SACK report if present */ @@ -2031,9 +2035,9 @@ tcp_do_segment(struct mbuf *m, struct tc tcp_state_change(tp, TCPS_SYN_RECEIVED); } - KASSERT(ti_locked == TI_WLOCKED, ("%s: trimthenstep6: " + KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: " "ti_locked %d", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2106,8 +2110,8 @@ tcp_do_segment(struct mbuf *m, struct tc SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - KASSERT(ti_locked == TI_WLOCKED, + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(ti_locked == TI_RLOCKED, ("%s: TH_RST ti_locked %d, th %p tp %p", __func__, ti_locked, th, tp)); KASSERT(tp->t_state != TCPS_SYN_SENT, @@ -2150,9 +2154,9 @@ tcp_do_segment(struct mbuf *m, struct tc * Send challenge ACK for any SYN in synchronized state. */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT) { - KASSERT(ti_locked == TI_WLOCKED, + KASSERT(ti_locked == TI_RLOCKED, ("tcp_do_segment: TH_SYN ti_locked %d", ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && @@ -2265,9 +2269,9 @@ tcp_do_segment(struct mbuf *m, struct tc */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { - KASSERT(ti_locked == TI_WLOCKED, ("%s: SS_NOFDEREF && " + KASSERT(ti_locked == TI_RLOCKED, ("%s: SS_NOFDEREF && " "CLOSE_WAIT && tlen ti_locked %d", __func__, ti_locked)); - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " @@ -2768,9 +2772,9 @@ process_ACK: */ case TCPS_CLOSING: *** DIFF OUTPUT TRUNCATED AT 1000 LINES ***