From owner-p4-projects@FreeBSD.ORG Thu Jan 24 07:36:02 2008 Return-Path: Delivered-To: p4-projects@freebsd.org Received: by hub.freebsd.org (Postfix, from userid 32767) id E278116A468; Thu, 24 Jan 2008 07:36:01 +0000 (UTC) Delivered-To: perforce@freebsd.org Received: from mx1.freebsd.org (mx1.freebsd.org [IPv6:2001:4f8:fff6::34]) by hub.freebsd.org (Postfix) with ESMTP id A641316A418 for ; Thu, 24 Jan 2008 07:36:01 +0000 (UTC) (envelope-from kmacy@freebsd.org) Received: from repoman.freebsd.org (repoman.freebsd.org [IPv6:2001:4f8:fff6::29]) by mx1.freebsd.org (Postfix) with ESMTP id 99C9713C4DB for ; Thu, 24 Jan 2008 07:36:01 +0000 (UTC) (envelope-from kmacy@freebsd.org) Received: from repoman.freebsd.org (localhost [127.0.0.1]) by repoman.freebsd.org (8.14.1/8.14.1) with ESMTP id m0O7a18o093492 for ; Thu, 24 Jan 2008 07:36:01 GMT (envelope-from kmacy@freebsd.org) Received: (from perforce@localhost) by repoman.freebsd.org (8.14.1/8.14.1/Submit) id m0O7a1ED093489 for perforce@freebsd.org; Thu, 24 Jan 2008 07:36:01 GMT (envelope-from kmacy@freebsd.org) Date: Thu, 24 Jan 2008 07:36:01 GMT Message-Id: <200801240736.m0O7a1ED093489@repoman.freebsd.org> X-Authentication-Warning: repoman.freebsd.org: perforce set sender to kmacy@freebsd.org using -f From: Kip Macy To: Perforce Change Reviews Cc: Subject: PERFORCE change 133977 for review X-BeenThere: p4-projects@freebsd.org X-Mailman-Version: 2.1.5 Precedence: list List-Id: p4 projects tree changes List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 24 Jan 2008 07:36:02 -0000 http://perforce.freebsd.org/chv.cgi?CH=133977 Change 133977 by kmacy@kmacy:storage:toehead on 2008/01/24 07:35:33 import cpl_io ddp support Affected files ... .. //depot/projects/toehead/sys/dev/cxgb/sys/mvec.h#6 edit .. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#5 edit .. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#4 edit .. 
//depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#4 edit Differences ... ==== //depot/projects/toehead/sys/dev/cxgb/sys/mvec.h#6 (text+ko) ==== @@ -48,9 +48,10 @@ extern int cxgb_mbufs_outstanding; extern int cxgb_pack_outstanding; -#define mtomv(m) ((struct mbuf_vec *)((m)->m_pktdat)) -#define M_IOVEC 0x100000 /* mbuf immediate data area is used for cluster ptrs */ -#define EXT_PHYS 10 /* physical/bus address */ +#define mtomv(m) ((struct mbuf_vec *)((m)->m_pktdat)) +#define M_IOVEC 0x100000 /* mbuf immediate data area is used for cluster ptrs */ +#define M_DDP 0x200000 /* direct data placement mbuf */ +#define EXT_PHYS 10 /* physical/bus address */ /* @@ -74,6 +75,11 @@ #define EXT_CLIOVEC 9 #define EXT_JMPIOVEC 10 +#define m_cur_offset m_ext.ext_size /* override to provide ddp offset */ +#define m_seq m_pkthdr.csum_data /* stored sequence */ +#define m_ddp_gl m_ext.ext_buf /* ddp list */ +#define m_ddp_flags m_pkthdr.csum_flags /* ddp flags */ +#define m_ulp_mode m_ext.ext_type /* upper level protocol */ extern uma_zone_t zone_miovec; ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#5 (text+ko) ==== @@ -478,6 +478,14 @@ return (credits); } +/* + * Returns true if a socket cannot accept new Rx data. + */ +static inline int +so_no_receive(const struct socket *so) +{ + return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING)); +} /* * Set of states for which we should return RX credits. @@ -1465,6 +1473,253 @@ } /* + * Returns true if we need to explicitly request RST when we receive new data + * on an RX-closed connection. + */ +static inline int +need_rst_on_excess_rx(const struct toepcb *toep) +{ + return (1); +} + +/* + * Handles Rx data that arrives in a state where the socket isn't accepting + * new data. 
+ */ +static void +handle_excess_rx(struct toepcb *toep, struct mbuf *m) +{ + + if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN)) + t3_send_reset(toep); + m_freem(m); +} + +/* + * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE) + * by getting the DDP offset from the TCB. + */ +static void +tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m) +{ + struct ddp_state *q = &toep->tp_ddp_state; + struct ddp_buf_state *bsp; + struct cpl_get_tcb_rpl *hdr; + unsigned int ddp_offset; + struct socket *so; + struct tcpcb *tp; + + uint64_t t; + __be64 *tcb; + + + /* Note that we only accout for CPL_GET_TCB issued by the DDP code. We + * really need a cookie in order to dispatch the RPLs. + */ + q->get_tcb_count--; + + /* It is a possible that a previous CPL already invalidated UBUF DDP + * and moved the cur_buf idx and hence no further processing of this + * skb is required. However, the app might be sleeping on + * !q->get_tcb_count and we need to wake it up. 
+ */ + if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) { + struct socket *so = toeptoso(toep); + + m_freem(m); + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup(so); + + return; + } + + bsp = &q->buf_state[q->cur_buf]; + hdr = cplhdr(m); + tcb = (__be64 *)(hdr + 1); + if (q->cur_buf == 0) { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]); + ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET); + } else { + t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]); + ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET; + } + ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET; + +#ifdef T3_TRACE + T3_TRACE3(TIDTB(so), + "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u", + tp->rcv_nxt, q->cur_buf, ddp_offset); +#endif + +#if 0 +{ + unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx; + + t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]); + ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS; + + t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]); + rcv_nxt = t >> S_TCB_RCV_NXT; + rcv_nxt &= M_TCB_RCV_NXT; + + t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]); + rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET); + rx_hdr_offset &= M_TCB_RX_HDR_OFFSET; + + T3_TRACE2(TIDTB(sk), + "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x", + ddp_flags, rcv_nxt - rx_hdr_offset); + T3_TRACE4(TB(q), + "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u", + tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf); + T3_TRACE3(TB(q), + "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u", + rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset); + T3_TRACE2(TB(q), + "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x", + q->buf_state[0].flags, q->buf_state[1].flags); + +} +#endif + m->m_cur_offset = bsp->cur_offset; + bsp->cur_offset = ddp_offset; + m->m_pkthdr.len = ddp_offset - m->m_cur_offset; + so = toeptoso(toep); + + if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) { + handle_excess_rx(toep, m); + return; + } + +#ifdef T3_TRACE + 
if ((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len"); + } +#endif + if (bsp->flags & DDP_BF_NOCOPY) { +#ifdef T3_TRACE + T3_TRACE0(TB(q), + "tcb_rpl_as_ddp_complete: CANCEL UBUF"); + + if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + printk("!cancel_ubuf"); + t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf"); + } +#endif + m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1; + bsp->flags &= ~DDP_BF_NOCOPY; + q->cur_buf ^= 1; + } else if (bsp->flags & DDP_BF_NOFLIP) { + + m->m_ddp_flags = 1; /* always a kernel buffer */ + + /* now HW buffer carries a user buffer */ + bsp->flags &= ~DDP_BF_NOFLIP; + bsp->flags |= DDP_BF_NOCOPY; + + /* It is possible that the CPL_GET_TCB_RPL doesn't indicate + * any new data in which case we're done. If in addition the + * offset is 0, then there wasn't a completion for the kbuf + * and we need to decrement the posted count. + */ + if (m->m_pkthdr.len == 0) { + if (ddp_offset == 0) + q->kbuf_posted--; + panic("length not set"); + m_free(m); + return; + } + } else { + /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP, + * but it got here way late and nobody cares anymore. + */ + m_free(m); + return; + } + + tp = toep->tp_tp; + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt += m->m_pkthdr.len; + tp->t_rcvtime = ticks; + +#if 0 + skb->h.th = tcphdr_skb->h.th; +#endif +#ifdef T3_TRACE + T3_TRACE3(TB(q), + "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u", + m->m_seq, q->cur_buf, m->m_pkthdr.len); +#endif +#ifdef notyet + __skb_queue_tail(&sk->sk_receive_queue, skb); +#endif + if (__predict_true((so->so_state & SS_NOFDREF) == 0)) + sorwakeup(so); +} + +/* + * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code, + * in that case they are similar to DDP completions. 
+ */ +static int +do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx) +{ + struct toepcb *toep = (struct toepcb *)ctx; + + /* OK if socket doesn't exist */ + if (toep == NULL) + return (CPL_RET_BUF_DONE); + + tcb_rpl_as_ddp_complete(toep, m); + + return (0); +} + +static void +handle_ddp_data(struct toepcb *toep, struct mbuf *m) +{ + struct tcpcb *tp = toep->tp_tp; + struct ddp_state *q; + struct ddp_buf_state *bsp; + struct cpl_rx_data *hdr = cplhdr(m); + unsigned int rcv_nxt = ntohl(hdr->seq); + + if (tp->rcv_nxt == rcv_nxt) + return; + + q = &toep->tp_ddp_state; + bsp = &q->buf_state[q->cur_buf]; + m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt; + +#ifdef T3_TRACE + if ((int)m->m_pkthdr.len < 0) { + t3_ddp_error(so, "handle_ddp_data: neg len"); + } +#endif + + m->m_ddp_gl = (unsigned char *)bsp->gl; + m->m_cur_offset = bsp->cur_offset; + m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1; + if (bsp->flags & DDP_BF_NOCOPY) + bsp->flags &= ~DDP_BF_NOCOPY; + + m->m_seq = tp->rcv_nxt; + tp->rcv_nxt = rcv_nxt; + bsp->cur_offset += m->m_pkthdr.len; + if (!(bsp->flags & DDP_BF_NOFLIP)) + q->cur_buf ^= 1; + tp->t_rcvtime = ticks; +#ifdef notyet + __skb_queue_tail(&sk->sk_receive_queue, skb); +#endif + /* For now, don't re-enable DDP after a connection fell out of DDP + * mode. + */ + q->ubuf_ddp_ready = 0; +} + +/* * Process new data received for a connection. 
*/ static void @@ -1477,26 +1732,25 @@ INP_LOCK(tp->t_inpcb); -#ifdef notyet - if (__predict_false(sk_no_receive(sk))) { - handle_excess_rx(so, skb); + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); return; } - if (ULP_MODE(tp) == ULP_MODE_TCPDDP) - handle_ddp_data(so, skb); + if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) + handle_ddp_data(toep, m); + + m->m_seq = ntohl(hdr->seq); + m->m_ddp_flags = 0; + m->m_ulp_mode = 0; /* for iSCSI */ - TCP_SKB_CB(skb)->seq = ntohl(hdr->seq); - TCP_SKB_CB(skb)->flags = 0; - skb_ulp_mode(skb) = 0; /* for iSCSI */ -#endif #if VALIDATE_SEQ - if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) { - printk(KERN_ERR + if (__predict_false(m->m_seq != tp->rcv_nxt)) { + log(LOG_ERR, "%s: TID %u: Bad sequence number %u, expected %u\n", - TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq, + TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq, tp->rcv_nxt); - __kfree_skb(skb); + m_freem(m); return; } #endif @@ -1528,8 +1782,8 @@ toep->tp_enqueued_bytes += m->m_pkthdr.len; #ifdef T3_TRACE T3_TRACE2(TIDTB(sk), - "new_rx_data: seq 0x%x len %u", - TCP_SKB_CB(skb)->seq, skb->len); + "new_rx_data: seq 0x%x len %u", + m->m_seq, m->m_pkthdr.len); #endif SOCKBUF_LOCK(&so->so_rcv); if (sb_notify(&so->so_rcv)) @@ -1567,21 +1821,20 @@ } static void -new_rx_data_ddp(struct socket *so, struct mbuf *m) +new_rx_data_ddp(struct toepcb *toep, struct mbuf *m) { - struct tcpcb *tp = sototcpcb(so); - struct toepcb *toep = tp->t_toe; + struct tcpcb *tp; struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_data_ddp *hdr; unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx; + struct socket *so = toeptoso(toep); -#ifdef notyet - if (unlikely(sk_no_receive(sk))) { - handle_excess_rx(so, m); + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); return; } -#endif + tp = sototcpcb(so); q = &toep->tp_ddp_state; hdr = cplhdr(m); @@ -1604,7 +1857,7 @@ rcv_nxt = ntohl(hdr->seq) + ddp_len; /* - * Overload to 
store old rcv_next + * Overload to store old RCV_NXT */ m->m_pkthdr.csum_data = tp->rcv_nxt; tp->rcv_nxt = rcv_nxt; @@ -1622,15 +1875,8 @@ * account for page pod's pg_offset. */ end_offset = G_DDP_OFFSET(ddp_report) + ddp_len; -#ifdef notyet - TCP_SKB_CB(skb)->when = end_offset - skb->len; - - /* - * We store in mac.raw the address of the gather list where the - * placement happened. - */ - skb->mac.raw = (unsigned char *)bsp->gl; -#endif + m->m_cur_offset = end_offset - m->m_pkthdr.len; + m->m_ddp_gl = (unsigned char *)bsp->gl; bsp->cur_offset = end_offset; /* @@ -1638,9 +1884,6 @@ * Note that other parts of the code depend on this being in bit 0. */ if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) { -#if 0 - TCP_SKB_CB(skb)->flags = 0; /* potential spurious completion */ -#endif panic("spurious ddp completion"); } else { m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE); @@ -1676,7 +1919,6 @@ do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = ctx; - struct socket *so = toeptoso(toep); const struct cpl_rx_data_ddp *hdr = cplhdr(m); VALIDATE_SOCK(so); @@ -1689,26 +1931,25 @@ #if 0 skb->h.th = tcphdr_skb->h.th; #endif - new_rx_data_ddp(so, m); + new_rx_data_ddp(toep, m); return (0); } static void -process_ddp_complete(struct socket *so, struct mbuf *m) +process_ddp_complete(struct toepcb *toep, struct mbuf *m) { - struct tcpcb *tp = sototcpcb(so); - struct toepcb *toep = tp->t_toe; + struct tcpcb *tp = toep->tp_tp; + struct socket *so = toeptoso(toep); struct ddp_state *q; struct ddp_buf_state *bsp; struct cpl_rx_ddp_complete *hdr; unsigned int ddp_report, buf_idx, when; -#ifdef notyet - if (unlikely(sk_no_receive(sk))) { - handle_excess_rx(sk, skb); + if (__predict_false(so_no_receive(so))) { + handle_excess_rx(toep, m); return; } -#endif + q = &toep->tp_ddp_state; hdr = cplhdr(m); ddp_report = ntohl(hdr->ddp_report); @@ -1748,11 +1989,11 @@ tp->rcv_nxt += m->m_len; tp->t_rcvtime = ticks; - 
sbappendstream_locked(&so->so_rcv, m); -#ifdef notyet - if (!sock_flag(sk, SOCK_DEAD)) - sk->sk_data_ready(sk, 0); -#endif + sbappendstream_locked(&so->so_rcv, m) + ; + if ((so->so_state & SS_NOFDREF) == 0) + sorwakeup_locked(so); + } /* @@ -1762,13 +2003,12 @@ do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx) { struct toepcb *toep = ctx; - struct socket *so = toeptoso(toep); VALIDATE_SOCK(so); #if 0 skb->h.th = tcphdr_skb->h.th; #endif - process_ddp_complete(so, m); + process_ddp_complete(toep, m); return (0); } @@ -3413,8 +3653,8 @@ #ifdef notyet t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify); t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt); +#endif t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl); -#endif return (0); } ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#4 (text+ko) ==== @@ -1001,6 +1001,38 @@ (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN); } + +static inline int +is_ddp(const struct mbuf *m) +{ + return (m->m_flags & M_DDP); +} + +static inline int +is_ddp_psh(const struct mbuf *m) +{ + return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH); +} + +/* + * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the + * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a + * DDP buffer. 
+ */ +static inline int +copy_data(const struct mbuf *m, int offset, struct iovec *to, int len) +{ + if (__predict_true(!is_ddp(m))) /* RX_DATA */ + return mbuf_copy_datagram_iovec(m, offset, to, len); + if (__predict_true(m->m_pkthdr.csum_flags & DDP_BF_NOCOPY)) { /* user DDP */ + to->iov_len -= len; + to->iov_base += len; + return 0; + } + return t3_ddp_copy(m, offset, to, len); /* kernel DDP */ +} + + #endif /* * Clean up DDP state that needs to survive until socket close time, such as the @@ -1014,9 +1046,6 @@ struct ddp_state *p = &toep->tp_ddp_state; int idx; - if (!p) - return; - for (idx = 0; idx < NUM_DDP_KBUF; idx++) if (p->kbuf[idx]) { ddp_gl_free_pages(p->kbuf[idx], 0); @@ -1026,6 +1055,7 @@ if (p->ubuf) { ddp_gl_free_pages(p->ubuf, 0); free(p->ubuf, M_DEVBUF); + p->ubuf = NULL; } toep->tp_ulp_mode = 0; } ==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#4 (text+ko) ==== @@ -135,9 +135,8 @@ /* * Returns 1 if a UBUF DMA buffer might be active. */ -static inline int t3_ddp_ubuf_pending(struct socket *so) +static inline int t3_ddp_ubuf_pending(struct toepcb *toep) { - struct toepcb *toep = sototcpcb(so)->t_toe; struct ddp_state *p = &toep->tp_ddp_state; /* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP,