Date:      Wed, 29 Sep 1999 23:09:36 -0400 (EDT)
From:      Bosko.Milekic@jehovah.technokratis.com (bmilekic@dsuper.net)
To:        FreeBSD-gnats-submit@freebsd.org
Subject:   kern/14042: mbuf shortage and calls with M_WAIT 
Message-ID:  <199909300309.XAA00879@jehovah.technokratis.com>


>Number:         14042
>Category:       kern
>Synopsis:       mbuf shortage and allocation with M_WAIT results in a panic
>Confidential:   no
>Severity:       serious
>Priority:       high
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Sep 29 20:10:00 PDT 1999
>Closed-Date:
>Last-Modified:
>Originator:     Bosko Milekic bmilekic@dsuper.net
>Release:        FreeBSD 3.3-STABLE i386
>Organization:
(None) 
>Environment:

$ uname -v
FreeBSD 3.3-STABLE #1: Wed Sep 29 19:35:08 EDT 1999     bmilekic@jehovah.technokratis.com

>Description:

Upon starvation of the mb_map arena, we rely solely on freed (recycled) mbufs
and mbuf clusters, since nothing is ever freed back to the map.
When no free mbuf is available and the caller specified M_DONTWAIT, a null is
returned (this is expected behaviour). However, if the caller specified M_WAIT
and nothing is available, a call to the protocol drain routines is made, and
if we still have nothing available after that point, we panic().
When no free mbuf cluster is available, it makes no difference whether we call
with M_DONTWAIT or M_WAIT once mb_map is already starved: a failure is
reported immediately.
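
In other words, a kernel caller currently sees the following contract (a
minimal sketch, assuming the usual sys/param.h and sys/mbuf.h includes;
example_alloc is a made-up name for illustration):

    static int
    example_alloc(void)
    {
            struct mbuf *m;

            MGET(m, M_DONTWAIT, MT_DATA);
            if (m == NULL)
                    return (ENOBUFS);       /* expected; caller recovers */
            m_freem(m);

            MGET(m, M_WAIT, MT_DATA);
            /*
             * A null check here buys nothing today: if m_retry() still
             * finds nothing after m_reclaim(), the kernel panic()s
             * ("Out of mbuf clusters") instead of returning.
             */
            m_freem(m);
            return (0);
    }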

>How-To-Repeat:

There are several ways to reproduce this problem. An obvious [local] way is
to request large socket buffer space with setsockopt(2) and fill it.
Eventually, we run out of mbufs and panic.
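
For instance, a userland sketch along these lines (hypothetical and untested;
per-process descriptor limits mean several instances may be needed) keeps
creating socket pairs with large buffers and filling them, pinning mbufs and
mbuf clusters until mb_map starves:

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <fcntl.h>
    #include <unistd.h>

    int
    main(void)
    {
            int sv[2], bufsize = 262144;
            char chunk[8192];

            for (;;) {
                    if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
                            break;          /* out of descriptors */
                    (void)setsockopt(sv[0], SOL_SOCKET, SO_SNDBUF,
                        &bufsize, sizeof(bufsize));
                    (void)fcntl(sv[0], F_SETFL, O_NONBLOCK);
                    /* Fill the send buffer and never read the other
                     * end; the queued data holds the mbufs forever. */
                    while (write(sv[0], chunk, sizeof(chunk)) > 0)
                            ;
            }
            for (;;)
                    pause();        /* keep everything pinned */
            /* NOTREACHED */
    }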

>Fix:
	
The diffs provided below add functions that sleep for a predefined amount
of time, waiting for an mbuf (or mbuf cluster) to be freed. If the allocation
fails even after the sleep, the callers of the allocation routines must check
for a null being returned. These diffs make sure that the check is performed
and, when the failure is detected, the calling routines typically return
ENOBUFS. Userland applications generating the system calls that eventually
reach the allocation routines (e.g. via sosend()) should check for the
possibility of ENOBUFS and handle the situation appropriately.
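
A userland writer that wants to cope with this might, for example, treat
ENOBUFS as transient (a sketch; send_enobufs_aware is a made-up name and
the back-off policy is the application's choice):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <errno.h>
    #include <unistd.h>

    /*
     * Send on a socket, backing off and retrying when the kernel
     * reports mbuf exhaustion instead of treating it as fatal.
     */
    ssize_t
    send_enobufs_aware(int s, const void *buf, size_t len)
    {
            ssize_t n;

            while ((n = send(s, buf, len, 0)) < 0) {
                    if (errno != ENOBUFS)
                            return (-1);    /* genuinely fatal */
                    usleep(100000);         /* brief back-off, retry */
            }
            return (n);
    }

Whether to retry, queue, or drop on ENOBUFS is application policy; the point
is simply that the error becomes visible instead of panicking the machine.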

These diffs are for -STABLE, but if the decision is made to commit them, I
will gladly make sure that they work and do not interfere with any other
work that may be in progress on this part of the code in -CURRENT.

The first diff applies to sys/kern, whereas the second applies to sys/sys/mbuf.h.

--8<--[mbuf.patch]--8<--

diff -ruN /usr/src/sys/kern.old/uipc_mbuf.c /usr/src/sys/kern/uipc_mbuf.c
--- /usr/src/sys/kern.old/uipc_mbuf.c   Wed Sep  8 20:45:50 1999
+++ /usr/src/sys/kern/uipc_mbuf.c       Wed Sep 29 19:31:37 1999
@@ -47,6 +47,10 @@
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
+#ifdef INVARIANTS
+#include <machine/cpu.h>
+#endif
+
 static void mbinit __P((void *));
 SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)
 
@@ -60,6 +64,8 @@
 int    max_hdr;
 int    max_datalen;
 
+static u_int m_mballoc_wid = 0, m_clalloc_wid = 0;
+
 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
           &max_linkhdr, 0, "");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
@@ -73,9 +79,12 @@
 
 /* "number of clusters of pages" */
 #define NCL_INIT       1
-
 #define NMB_INIT       16
 
+/* M_SLEEP_TIME/hz determines the duration of the mbuf and mbuf
+   cluster "wait" sleeps when exhaustion occurs. */
+#define M_SLEEP_TIME   32
+
 /* ARGSUSED*/
 static void
 mbinit(dummy)
@@ -153,6 +162,50 @@
        return (1);
 }
 
+/*
+ * For callers with M_WAIT (who prefer -- and can afford -- to wait
+ * a little longer as opposed to immediately returning null).
+ */
+struct mbuf *
+m_mballoc_wait(caller, type)
+       int caller;
+       u_short type;
+{
+       struct mbuf *p;
+
+       /* Sleep until something's available or until we expire. */
+       m_mballoc_wid++;
+       if ( (tsleep(&m_mballoc_wid, PVM, "mballc",
+           M_SLEEP_TIME)) == EWOULDBLOCK)
+               m_mballoc_wid--;
+       
+       /*
+        * Now that we (think) we've got something, we will redo an
+        * MGET, but avoid getting into another instance of m_mballoc_wait()
+        */
+#define m_mballoc_wait(caller,type)    (struct mbuf *)0 
+       if (caller == MGET_C) {
+               MGET(p, M_WAIT, type);
+       } else {
+               MGETHDR(p, M_WAIT, type);
+       }
+#undef m_mballoc_wait
+ 
+       return (p);
+}
+
+/*
+ * Function used to wakeup sleepers waiting for mbufs...
+ */
+void
+m_mballoc_wakeup(void)
+{
+       if (m_mballoc_wid) {
+               m_mballoc_wid = 0;
+               wakeup(&m_mballoc_wid);
+       }
+}
+
 #if MCLBYTES > PAGE_SIZE
 static int i_want_my_mcl;
 
@@ -242,6 +295,46 @@
 }
 
 /*
+ * For callers with M_WAIT (who prefer -- and can afford -- to wait
+ * a little longer as opposed to immediately returning null).
+ */
+caddr_t
+m_clalloc_wait(void)
+{
+       caddr_t p;
+
+       KASSERT(intr_nesting_level == 0,("CLALLOC: CANNOT WAIT IN INTERRUPT"));
+
+       /* Sleep until something's available or until we expire. */
+       m_clalloc_wid++;
+       if ( (tsleep(&m_clalloc_wid, PVM, "mclalc",
+           M_SLEEP_TIME)) == EWOULDBLOCK)
+               m_clalloc_wid--;
+       
+       /*
+        * Now that we (think) we've got something, we will redo an
+        * MCLALLOC, but avoid getting into another instance of m_clalloc_wait()
+        */
+#define m_clalloc_wait()       (caddr_t)0
+       MCLALLOC(p,M_WAIT);
+#undef m_clalloc_wait
+
+       return (p);
+}
+
+/*
+ * Function used to wakeup sleepers waiting for mbuf clusters...
+ */
+void
+m_clalloc_wakeup(void)
+{
+       if (m_clalloc_wid) {
+               m_clalloc_wid = 0;
+               wakeup(&m_clalloc_wid);
+       }
+}
+
+/*
  * When MGET fails, ask protocols to free space when short of memory,
  * then re-attempt to allocate an mbuf.
  */
@@ -254,19 +347,27 @@
        /*
         * Must only do the reclaim if not in an interrupt context.
         */
-       if (i == M_WAIT)
+       if (i == M_WAIT) {
+               KASSERT(intr_nesting_level == 0,
+                   ("MBALLOC: CANNOT WAIT IN INTERRUPT"));
                m_reclaim();
+       }
+
+       /*
+        * XXX Be paranoid and define both as null. We don't want to wait
+        * from here.
+        */
+#define m_mballoc_wait(caller,type)    (struct mbuf *)0
 #define m_retry(i, t)  (struct mbuf *)0
        MGET(m, i, t);
 #undef m_retry
+#undef m_mballoc_wait
+
        if (m != NULL) {
                mbstat.m_wait++;
-       } else {
-               if (i == M_DONTWAIT)
-                       mbstat.m_drops++;
-               else
-                       panic("Out of mbuf clusters");
-       }
+       } else 
+               mbstat.m_drops++;
+
        return (m);
 }
 
@@ -284,17 +385,16 @@
         */
        if (i == M_WAIT)
                m_reclaim();
+#define m_mballoc_wait(caller,type) (struct mbuf *)0
 #define m_retryhdr(i, t) (struct mbuf *)0
        MGETHDR(m, i, t);
 #undef m_retryhdr
+#undef m_mballoc_wait
        if (m != NULL) {
                mbstat.m_wait++;
-       } else {
-               if (i == M_DONTWAIT)
-                       mbstat.m_drops++;
-               else
-                       panic("Out of mbuf clusters");
-       }
+       } else 
+               mbstat.m_drops++;
+
        return (m);
 }
 
diff -ruN /usr/src/sys/kern.old/uipc_socket.c /usr/src/sys/kern/uipc_socket.c
--- /usr/src/sys/kern.old/uipc_socket.c Wed Sep  8 20:45:50 1999
+++ /usr/src/sys/kern/uipc_socket.c     Sun Sep 26 19:33:42 1999
@@ -486,15 +486,21 @@
                    } else do {
                        if (top == 0) {
                                MGETHDR(m, M_WAIT, MT_DATA);
+                               if (m == NULL)
+                                       return ENOBUFS;
                                mlen = MHLEN;
                                m->m_pkthdr.len = 0;
                                m->m_pkthdr.rcvif = (struct ifnet *)0;
                        } else {
                                MGET(m, M_WAIT, MT_DATA);
+                               if (m == NULL)
+                                       return ENOBUFS;
                                mlen = MLEN;
                        }
                        if (resid >= MINCLSIZE) {
                                MCLGET(m, M_WAIT);
+                               if (m == NULL)
+                                       return ENOBUFS;
                                if ((m->m_flags & M_EXT) == 0)
                                        goto nopages;
                                mlen = MCLBYTES;
@@ -606,6 +612,8 @@
                flags = 0;
        if (flags & MSG_OOB) {
                m = m_get(M_WAIT, MT_DATA);
+               if (m == NULL)
+                       return ENOBUFS;
                error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
                if (error)
                        goto bad;
diff -ruN /usr/src/sys/kern.old/uipc_socket2.c /usr/src/sys/kern/uipc_socket2.c
--- /usr/src/sys/kern.old/uipc_socket2.c        Wed Sep  8 20:45:50 1999
+++ /usr/src/sys/kern/uipc_socket2.c    Sun Sep 26 16:17:56 1999
@@ -407,7 +407,15 @@
 sbrelease(sb)
        struct sockbuf *sb;
 {
-
+       /* 
+        * XXX If sockbuf is locked and we call sbflush, we will
+        * panic for sure, avoid that for now by just unlocking the
+        * sockbuf, so that sbflush can flush it, because if this is
+        * occurring, chances are we want to kill the process holding
+        * the sockbuf space anyway...
+        */
+       if (sb->sb_flags & SB_LOCK) 
+               sb->sb_flags &= ~SB_LOCK;
        sbflush(sb);
        sb->sb_hiwat = sb->sb_mbmax = 0;
 }
diff -ruN /usr/src/sys/kern.old/uipc_syscalls.c /usr/src/sys/kern/uipc_syscalls.c
--- /usr/src/sys/kern.old/uipc_syscalls.c       Wed Sep  8 20:45:50 1999
+++ /usr/src/sys/kern/uipc_syscalls.c   Sun Sep 26 19:34:29 1999
@@ -1620,6 +1620,8 @@
                 * Get an mbuf header and set it up as having external storage.
                 */
                MGETHDR(m, M_WAIT, MT_DATA);
+               if (m == NULL)
+                       return ENOBUFS;
                m->m_ext.ext_free = sf_buf_free;
                m->m_ext.ext_ref = sf_buf_ref;
                m->m_ext.ext_buf = (void *)sf->kva;

--8<--[end]--8<--


--8<--[mbuf2.patch]--8<--

--- mbuf.h.orig Sun Sep 26 14:17:20 1999
+++ mbuf.h      Tue Sep 28 23:39:22 1999
@@ -153,6 +153,17 @@
 #define        M_DONTWAIT      1
 #define        M_WAIT          0
 
+/* 
+ * Flags to pass to the *_wait functions (when we have to wait for an
+ * mbuf to be freed).
+ * XXX this could theoretically also be implemented as a char (which would
+ * be SLIGHTLY less costly) that would be ORed with a specific 'pattern'
+ * that could represent each one of the options -- but it's kept this way
+ * for now for simplicity's sake.
+ */
+#define MGET_C         1
+#define MGETHDR_C      2
+
 /* Freelists:
  *
  * Normal mbuf clusters are normally treated as character arrays
@@ -203,7 +214,8 @@
                splx(_ms); \
        } else { \
                splx(_ms); \
-               (m) = m_retry((how), (type)); \
+               if (((m)=m_retry((how), (type)))==NULL && (how)==M_WAIT) \
+                       (m) = m_mballoc_wait(MGET_C,(type)); \
        } \
 }
 
@@ -223,7 +235,8 @@
                splx(_ms); \
        } else { \
                splx(_ms); \
-               (m) = m_retryhdr((how), (type)); \
+               if (((m)=m_retryhdr((how),(type)))==NULL && (how)==M_WAIT) \
+                       (m) = m_mballoc_wait(MGETHDR_C,(type)); \
        } \
 }
 
@@ -235,16 +248,21 @@
  * MCLFREE releases a reference to a cluster allocated by MCLALLOC,
  * freeing the cluster if the reference count has reached 0.
  */
-#define        MCLALLOC(p, how) \
-       MBUFLOCK( \
-         if (mclfree == 0) \
+#define        MCLALLOC(p, how) { \
+       int _ms = splimp(); \
+       if (mclfree == 0) \
                (void)m_clalloc(1, (how)); \
-         if (((p) = (caddr_t)mclfree) != 0) { \
+       if (((p) = (caddr_t)mclfree) != 0) { \
                ++mclrefcnt[mtocl(p)]; \
                mbstat.m_clfree--; \
                mclfree = ((union mcluster *)(p))->mcl_next; \
-         } \
-       )
+               splx(_ms); \
+       } else { \
+               splx(_ms); \
+               if ((how) == M_WAIT) \
+                       (p) = m_clalloc_wait(); \
+       } \
+}
 
 #define        MCLGET(m, how) \
        { MCLALLOC((m)->m_ext.ext_buf, (how)); \
@@ -263,6 +280,7 @@
                ((union mcluster *)(p))->mcl_next = mclfree; \
                mclfree = (union mcluster *)(p); \
                mbstat.m_clfree++; \
+               (void)m_clalloc_wakeup(); \
          } \
        )
 
@@ -284,6 +302,7 @@
                                ((union mcluster *)(p))->mcl_next = mclfree; \
                                mclfree = (union mcluster *)(p); \
                                mbstat.m_clfree++; \
+                               (void)m_clalloc_wakeup(); \
                        } \
                } \
          } \
@@ -292,6 +311,7 @@
          mbstat.m_mtypes[MT_FREE]++; \
          (m)->m_next = mmbfree; \
          mmbfree = (m); \
+         (void)m_mballoc_wakeup(); \
        )
 
 /*
@@ -408,16 +428,20 @@
 struct mbuf *m_gethdr __P((int, int));
 struct mbuf *m_prepend __P((struct mbuf *,int,int));
 struct mbuf *m_pullup __P((struct mbuf *, int));
+struct mbuf *m_mballoc_wait __P((int,u_short));
 struct mbuf *m_retry __P((int, int));
 struct mbuf *m_retryhdr __P((int, int));
 struct mbuf *m_split __P((struct mbuf *,int,int));
 void   m_adj __P((struct mbuf *, int));
 void   m_cat __P((struct mbuf *,struct mbuf *));
+void   m_mballoc_wakeup __P((void));
+void   m_clalloc_wakeup __P((void));
 int    m_mballoc __P((int, int));
 int    m_clalloc __P((int, int));
 void   m_copyback __P((struct mbuf *, int, int, caddr_t));
 void   m_copydata __P((struct mbuf *,int,int,caddr_t));
 void   m_freem __P((struct mbuf *));
+caddr_t        m_clalloc_wait __P((void));
 #endif /* KERNEL */
 
 #endif /* !_SYS_MBUF_H_ */

--8<--[end]--8<--
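
Taken together, the new allocation flow for an M_WAIT caller looks roughly
like this (an annotated trace of the patched paths, not part of the patch):

    /*
     * MGET(m, M_WAIT, MT_DATA)
     *   -> free list empty, m_mballoc() fails -> m_retry()
     *        -> m_reclaim(), then one more MGET with the retry and
     *           wait paths compiled out (no recursion)
     *   -> still null -> m_mballoc_wait(MGET_C, MT_DATA)
     *        -> tsleep(&m_mballoc_wid, PVM, "mballc", M_SLEEP_TIME)
     *        <- an MFREE elsewhere calls m_mballoc_wakeup()
     *   -> one final MGET; a null here propagates to the caller,
     *      which returns ENOBUFS instead of panicking.
     */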

There are still a few calls to the allocation routines and macros with M_WAIT
in some of the net module code (net/, netinet/, etc.). Most of these calls
check for the possibility of error, but a few still do not. From what I've
been able to see, what remains is not much and can be remedied fairly quickly
(I just never got around to it). Again, if interest arises, I will gladly go
over those as well -- these patches, however, take care of the main problem.
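
The check those remaining callers need is the same one the diffs add to
uipc_socket.c and uipc_syscalls.c, e.g. (a sketch of the pattern, not a diff
against any particular file):

    struct mbuf *m;

    MGETHDR(m, M_WAIT, MT_DATA);    /* may now return null after the sleep */
    if (m == NULL)
            return (ENOBUFS);       /* propagate instead of panicking */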

--Bosko M.


>Release-Note:
>Audit-Trail:
>Unformatted:

