Date:      Sat, 22 Aug 1998 22:30:01 -0700 (PDT)
From:      Matt Dillon <dillon@best.net>
To:        freebsd-bugs@FreeBSD.ORG
Subject:   Re: kern/7557: -current machine running Diablo, lockup, possible inode deadlock
Message-ID:  <199808230530.WAA28407@freefall.freebsd.org>

The following reply was made to PR kern/7557; it has been noted by GNATS.

From: Matt Dillon <dillon@best.net>
To: freebsd-gnats-submit@freebsd.org
Cc:
Subject: Re: kern/7557: -current machine running Diablo, lockup, possible inode deadlock
Date: Sat, 22 Aug 1998 22:24:25 -0700 (PDT)

     More information on the deadlock.  I have two full-debug crash dumps.
     It's hard to track things down, but both crash dumps share an
     interesting commonality.  In both instances, in addition to all the
     processes stuck in inode locks, there is a process stuck in getblk AND
     a process stuck in pgtblk.  The getblk wait occurs if bp->b_flags &
     B_BUSY is set, and the pgtblk wait occurs if (on a vm_page_t)
     m->flags & PG_BUSY is set.
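
     For reference, the two sleep sites correspond roughly to the following
     logic in vfs_bio.c.  This is a paraphrase from memory rather than a
     verbatim excerpt, so the exact line numbers and surrounding code will
     differ, but the tsleep priorities match the backtraces below
     (0x14 == PRIBIO + 4, 0x4 == PVM):

      /* getblk(): the buffer exists but someone else has it B_BUSY */
      if (bp->b_flags & B_BUSY) {
              bp->b_flags |= B_WANTED;
              /* slpflag and slptimeo come from the getblk() caller */
              tsleep(bp, PRIBIO + 4, "getblk", slptimeo);
      }

      /* allocbuf(): one of the buffer's backing VM pages is busy */
      if (m->flags & PG_BUSY) {
              m->flags |= PG_WANTED;
              tsleep(m, PVM, "pgtblk", 0);
      }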
 
 
 nntp3:/var/crash# ps -M vmcore.6 -N kernel.6 -axl | less
   UID   PID  PPID CPU PRI NI   VSZ  RSS WCHAN  STAT  TT       TIME COMMAND
     ...
    8 10400   198   0  -2  0 43780    0 getblk D     ??    0:00.00  (diablo)
    8 10419   198   0 -18  0 43788    0 pgtblk D     ??    0:00.00  (diablo)
 
 nntp3:/var/crash# ps -M vmcore.7 -N kernel.7 -axl | less
   UID   PID  PPID CPU PRI NI   VSZ  RSS WCHAN  STAT  TT       TIME COMMAND
      ...
    8   319   198   1  -2  0 44312    0 getblk D     ??    0:00.00  (diablo)
    8   280   198   1 -18  0 44312    0 pgtblk D     ??    0:00.00  (diablo)
 
 (kgdb) proc 319
 (kgdb) back
 #0  mi_switch () at ../../kern/kern_synch.c:661
 #1  0xf0119fb1 in tsleep (ident=0xf6e400b0, priority=0x14, 
     wmesg=0xf013432f "getblk", timo=0x0) at ../../kern/kern_synch.c:435
 #2  0xf01343dd in getblk (vp=0xfc24d180, blkno=0xc, size=0x2000, slpflag=0x0, 
     slptimeo=0x0) at ../../kern/vfs_bio.c:1437
 #3  0xf01366fb in cluster_read (vp=0xfc24d180, filesize=0x4034b2, lblkno=0xc, 
     size=0x2000, cred=0x0, totread=0x10000, seqcount=0x8, bpp=0xfc10fd60)
     at ../../kern/vfs_cluster.c:114
 #4  0xf01ac721 in ffs_read (ap=0xfc10fe18) at ../../ufs/ufs/ufs_readwrite.c:168
 #5  0xf01ad2bd in ffs_getpages (ap=0xfc10fe70) at vnode_if.h:303
 #6  0xf01c386a in vnode_pager_getpages (object=0xfc3f0220, m=0xfc10ff1c, 
     count=0x2, reqpage=0x0) at vnode_if.h:1067
 #7  0xf01c2587 in vm_pager_get_pages (object=0xfc3f0220, m=0xfc10ff1c, 
     count=0x2, reqpage=0x0) at ../../vm/vm_pager.c:256
 #8  0xf01b6f34 in vm_fault (map=0xfc073380, vaddr=0x22897000, fault_type=0x1, 
     fault_flags=0x0) at ../../vm/vm_fault.c:424
 #9  0xf01daca2 in trap_pfault (frame=0xfc10ffbc, usermode=0x1)
     at ../../i386/i386/trap.c:753
 #10 0xf01da7e3 in trap (frame={tf_es = 0xefbf0027, tf_ds = 0xfc100027, 
       tf_edi = 0x1, tf_esi = 0x17fea, tf_ebp = 0xefbfd58c, 
       tf_isp = 0xfc10ffe4, tf_ebx = 0x18000, tf_edx = 0x2287f000, 
       tf_ecx = 0x0, tf_eax = 0x9cf7f, tf_trapno = 0xc, tf_err = 0x4, 
       tf_eip = 0x414c, tf_cs = 0x1f, tf_eflags = 0x10297, tf_esp = 0xefbfd520, 
       tf_ss = 0x27}) at ../../i386/i386/trap.c:317
 #11 0x414c in ?? ()
 #12 0x276e in ?? ()
 #13 0x1ee1 in ?? ()
 #14 0x1809 in ?? ()
 #15 0x107e in ?? ()
 (kgdb) frame 2
 #2  0xf01343dd in getblk (vp=0xfc24d180, blkno=0xc, size=0x2000, slpflag=0x0, 
     slptimeo=0x0) at ../../kern/vfs_bio.c:1437
 1437                            if (!tsleep(bp,
 (kgdb) print bp
 $11 = (struct buf *) 0xf6e400b0
 (kgdb) print *bp
 $12 = {
   b_hash = {
     le_next = 0x0, 
     le_prev = 0xf6e2f0f8
   }, 
   b_vnbufs = {
     le_next = 0xf6e4f258, 
     le_prev = 0xfc24d1b0
   }, 
   b_freelist = {
     tqe_next = 0xf6d77f08, 
     tqe_prev = 0xf0202158
   }, 
   b_act = {
     tqe_next = 0x0, 
     tqe_prev = 0xf1ca0e14
   }, 
   b_proc = 0x0, 
   b_flags = 0x20800030, 
   b_qindex = 0x0, 
   b_usecount = 0x6, 
   b_error = 0x0, 
   b_bufsize = 0x0, 
   b_bcount = 0x0, 
   b_resid = 0x0, 
   b_dev = 0xffffffff, 
   b_data = 0xf95ae000 <Address 0xf95ae000 out of bounds>, 
   b_kvabase = 0xf95ae000 <Address 0xf95ae000 out of bounds>, 
   b_kvasize = 0x2000, 
   b_lblkno = 0xc, 
   b_blkno = 0xc, 
   b_offset = 0x0000000000018000, 
   b_iodone = 0, 
   b_iodone_chain = 0x0, 
   b_vp = 0xfc24d180, 
   b_dirtyoff = 0x0, 
   b_dirtyend = 0x0, 
   b_rcred = 0x0, 
   b_wcred = 0x0, 
   b_validoff = 0x0, 
   b_validend = 0x0, 
   b_pblkno = 0x9804d0, 
   b_saveaddr = 0x0, 
   b_savekva = 0x0, 
   b_driver1 = 0x0, 
   b_driver2 = 0x0, 
   b_spc = 0x0, 
   b_cluster = {
     cluster_head = {
       tqh_first = 0xf6d77f08, 
       tqh_last = 0xf6d850e8
     }, 
     cluster_entry = {
       tqe_next = 0xf6d77f08, 
       tqe_prev = 0xf6d850e8
     }
   }, 
   b_pages = {0x0 <repeats 32 times>}, 
   b_npages = 0x0, 
   b_dep = {
     lh_first = 0x0
   }
 }
 (kgdb) proc 280
 (kgdb) back
 #0  mi_switch () at ../../kern/kern_synch.c:661
 #1  0xf0119fb1 in tsleep (ident=0xf0e19ba0, priority=0x4, 
     wmesg=0xf01346d2 "pgtblk", timo=0x0) at ../../kern/kern_synch.c:435
 #2  0xf0134afa in allocbuf (bp=0xf6e400b0, size=0x2000)
     at ../../kern/vfs_bio.c:1799
 #3  0xf0134612 in getblk (vp=0xfc24d180, blkno=0xc, size=0x2000, slpflag=0x0, 
     slptimeo=0x0) at ../../kern/vfs_bio.c:1557
 #4  0xf0136a5e in cluster_read (vp=0xfc24d180, filesize=0x4034b2, lblkno=0xc, 
     size=0x2000, cred=0x0, totread=0xc000, seqcount=0x8, bpp=0xfc0f6d60)
     at ../../kern/vfs_cluster.c:235
 #5  0xf01ac721 in ffs_read (ap=0xfc0f6e18) at ../../ufs/ufs/ufs_readwrite.c:168
 #6  0xf01ad2bd in ffs_getpages (ap=0xfc0f6e70) at vnode_if.h:303
 #7  0xf01c386a in vnode_pager_getpages (object=0xfc3f0220, m=0xfc0f6f1c, 
     count=0x2, reqpage=0x0) at vnode_if.h:1067
 #8  0xf01c2587 in vm_pager_get_pages (object=0xfc3f0220, m=0xfc0f6f1c, 
     count=0x2, reqpage=0x0) at ../../vm/vm_pager.c:256
 #9  0xf01b6f34 in vm_fault (map=0xfc0738c0, vaddr=0x22891000, fault_type=0x1, 
     fault_flags=0x0) at ../../vm/vm_fault.c:424
 #10 0xf01daca2 in trap_pfault (frame=0xfc0f6fbc, usermode=0x1)
     at ../../i386/i386/trap.c:753
 #11 0xf01da7e3 in trap (frame={tf_es = 0xefbf0027, tf_ds = 0xfc0f0027, 
       tf_edi = 0x1, tf_esi = 0x11fd1, tf_ebp = 0xefbfd58c, 
       tf_isp = 0xfc0f6fe4, tf_ebx = 0x12000, tf_edx = 0x2287f000, 
       tf_ecx = 0x0, tf_eax = 0x9cf7f, tf_trapno = 0xc, tf_err = 0x4, 
       tf_eip = 0x414c, tf_cs = 0x1f, tf_eflags = 0x10297, tf_esp = 0xefbfd520, 
       tf_ss = 0x27}) at ../../i386/i386/trap.c:317
 #12 0x414c in ?? ()
 #13 0x276e in ?? ()
 #14 0x1ee1 in ?? ()
 #15 0x1809 in ?? ()
 #16 0x107e in ?? ()
 (kgdb) 
 (kgdb) print bp
 $14 = (struct buf *) 0xf6e400b0		(this is the same bp)
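
     Decoding the b_flags value 0x20800030 from the dump above, assuming
     the <sys/buf.h> flag values below are right for this kernel (quoted
     from memory, so double-check against the real header):

      B_BUSY    0x00000010
      B_CACHE   0x00000020
      B_WANTED  0x00800000
      B_VMIO    0x20000000

      0x20000000 | 0x00800000 | 0x00000020 | 0x00000010 == 0x20800030

     i.e. the buffer is B_VMIO | B_WANTED | B_CACHE | B_BUSY: it is busy,
     presumably held by the process in pgtblk (which obtained it in its own
     getblk call before entering allocbuf), and it already has a waiter,
     which would be the process sleeping in getblk.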
 
     The deadlock doesn't appear to originate here, but the coincidence
     seems odd.  I haven't been able to determine exactly what is causing
     the deadlock.  If I track down the inode lock chains I always end up
     at a process that is waiting for an exclusive lock on a shared-locked
     inode (with a share count of 2, even!).  I cannot determine who is
     holding the shared lock(s) in order to track it down further.
 
 inode f1d27400  lock holder 0xCC pid 204
   204 fbfb25c0 fc01f000   8   200   200  000105  S  dreaderd     inode f1d27200
 inode f1d27200  lock holder 0xFC pid 252
   252 fc098d40 fc0da000   8   198   198  000105  S  diablo       inode f1d2ca00
 inode f1d2ca00  lock holder 0xFE pid 254
   254 fc098ac0 fc0e2000   8   198   198  000105  S  diablo       inode f1d58000
         lock holder 0xF1 241
   241 fc0999c0 fc0b3000   8   198   198  000105  S  diablo       inode f1f25000
         lock holder 0x13e (318)
   318 fc097d00 fc10b000   8   198   198  000105  S  diablo       inode f21e6e00
         shared lock, share cnt 2, waitcnt 1
 
        Here I'm stuck.  I can't tell who is holding the shared lock on
        inode f21e6e00.  However, this inode is associated with vnode
        fc24d180, which happens to be the vnode owning the bp that the
        other two processes are stuck on in getblk and pgtblk, so maybe
        they are the ones holding the shared lock.

        Whatever the case, the result is a chain reaction that puts just
        about every process in the system into an inode wait state.
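
        To summarize the suspected cycle (pids are from vmcore.7; the
        shared-lock ownership is the part I cannot prove from the dumps):

          pid 280 (pgtblk): possibly one of the two shared holders of the
                            f21e6e00 inode lock; holds bp 0xf6e400b0
                            B_BUSY and sleeps waiting for a PG_BUSY page.
          pid 319 (getblk): possibly the other shared holder; sleeps
                            waiting for the same bp 0xf6e400b0 to lose
                            B_BUSY.
          pid 318:          holds the f1f25000 inode lock and wants the
                            f21e6e00 inode lock exclusively, so it sleeps
                            until the share count drains.
          pids 241, 254,
          252, 204, ...:    queue up on inode locks behind pid 318, which
                            is the chain reaction described above.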
 
 						-Matt
 
     Matthew Dillon  Engineering, HiWay Technologies, Inc. & BEST Internet
                     Communications.
     <dillon@best.net> (Please include portions of article in any response)



