Skip site navigation (1)Skip section navigation (2)
Date:      Sat, 28 Sep 1996 01:09:25 -0700
From:      Julian Elischer <julian@whistle.com>
To:        current@freebsd.org
Subject:   Locking snafu in -current
Message-ID:  <324CDD35.167EB0E7@whistle.com>

next in thread | raw e-mail | index | archive | help
This seems to have been present for a long time..

massively 'overthrashed' machine eventually hangs as all processes
end up stopped in ufslk2 wait or flock wait
(according to /proc or top)

My test system appears to be going that way now but hasn't 
totally stopped.

the major waiters seem to have the following stack trace:
#0  ufs_lock (ap=0xefbffeb8) at ../../ufs/ufs/ufs_vnops.c:1699
#1  0xf01ae5d1 in vnode_pager_lock (object=0xf0a99980) at
./vnode_if.h:731
#2  0xf01a141c in vm_fault (map=0xf0a6b900, vaddr=134692864,
fault_type=1 '\001', change_wiring=0) at ../../vm/vm_fault.c:203
#3  0xf01baffc in trap_pfault (frame=0xefbfffbc, usermode=1) at
../../i386/i386/trap.c:632
#4  0xf01bab87 in trap (frame={tf_es = 134938663, tf_ds = 39, tf_edi =
369511, tf_esi = 3, tf_ebp = -272641892, tf_isp = -272629788, tf_ebx =
369500, tf_edx = 9, tf_ecx = 0, tf_eax = -272642295, tf_trapno = 12,
tf_err = 4, tf_eip = 134693728, tf_cs = 31, tf_eflags = 66118, tf_esp =
-272642240, tf_ss = 39}) at ../../i386/i386/trap.c:239
#5  0xf01b33c7 in calltrap ()
though this may be a red-herring
as I can't tell which
of these processes might eventually wake up and which are
hung..

several dozen processes at a time might suddenly be waiting on some ufs
lock
but at some stage
it appears that the process that has the lock dies and everything
stops..
at this stage the system is still alive but I have the following dump of
/proc/*/status:
[frame1.whistle.com] 23tcat /proc/*/status
swapper 0 0 0 0 -1,-1 sldr 843891950,0 0,0 0,969130 sched 0 0 0,0
init 1 0 1 1 -1,-1 sldr -1,-1 0,84674 0,299617 wait 0 0 0,0
tcsh 106 1 106 106 12,0 ctty,sldr -1,-1 1,190232 4,322808 ttyin 0 0
0,0,0,2,3,4,5,20,31
filterd 112 1 112 112 -1,-1 sldr -1,-1 0,170151 0,226868 pause 0 0 0,0
gated 120 1 120 0 -1,-1 noflags 843891998,981426 1,700206 17,507528
vnread 0 0 0,0
sniffd 127 1 127 127 -1,-1 sldr 843892001,572723 50,462111 130,962864
nochan 0 0 0,0
named 135 1 135 135 -1,-1 sldr -1,-1 0,116476 13,992701 select 0 0 0,0
httpd 146 1 146 146 -1,-1 sldr 843892016,688153 1,376891 5,499697 wait 0
0 0,0,0,2,3,4,5,20,31
httpd 149 146 146 146 -1,-1 noflags -1,-1 0,689781 13,521276 accept 0 0
0,0,0,2,3,4,5,20,31
httpd 151 146 146 146 -1,-1 noflags -1,-1 0,349235 5,774020 accept 0 0
0,0,0,2,3,4,5,20,31
ftpd 161 1 161 161 -1,-1 sldr -1,-1 0,11453 0,17179 accept 0 0 0,0
sendmail 177 1 177 177 -1,-1 sldr -1,-1 0,340166 12,616397 accept 0 0
0,0
sendmail 178 177 177 177 -1,-1 noflags -1,-1 0,362334 1,630503 lockf 0 0
0,0
pagedaemon 2 0 0 0 -1,-1 noflags 843891950,16727 0,0 261,130720 psleep 0
0 0,0
atalkd 200 1 200 0 -1,-1 noflags 843892047,403006 1,506592 11,314974
select 0 0 0,0
afpd 213 1 195 0 -1,-1 noflags -1,-1 0,522152 8,346526 sbwait 0 0 0,0
smbd 221 1 221 221 -1,-1 sldr -1,-1 0,0 0,29129 accept 0 0 0,0
nmbd 223 1 223 223 -1,-1 sldr 843892093,987846 1,329655 9,722624 select
0 0 0,0
sendmail 230 177 177 177 -1,-1 noflags -1,-1 0,273136 1,451527 lockf 0 0
0,0
sendmail 235 177 177 177 -1,-1 noflags -1,-1 0,307113 1,323513 lockf 0 0
0,0
sendmail 237 177 177 177 -1,-1 noflags -1,-1 0,293162 1,311518 lockf 0 0
0,0
sendmail 240 177 177 177 -1,-1 noflags -1,-1 0,317234 1,284042 lockf 0 0
0,0
sendmail 245 177 177 177 -1,-1 noflags -1,-1 0,290719 1,406453 lockf 0 0
0,0
sendmail 248 177 177 177 -1,-1 noflags -1,-1 0,253944 1,331282 lockf 0 0
0,0
sendmail 256 177 177 177 -1,-1 noflags -1,-1 0,319580 1,366204 lockf 0 0
0,0
sendmail 258 177 177 177 -1,-1 noflags -1,-1 0,357550 1,298063 lockf 0 0
0,0
sendmail 260 177 177 177 -1,-1 noflags -1,-1 0,324134 1,335124 lockf 0 0
0,0
getty 262 1 262 262 28,0 ctty,sldr -1,-1 0,64743 0,56650 ttyin 0 0 0,0
httpd 268 146 146 146 -1,-1 noflags -1,-1 0,512358 6,324419 accept 0 0
0,0,0,2,3,4,5,20,31
httpd 269 146 146 146 -1,-1 noflags -1,-1 0,442213 14,11181 accept 0 0
0,0,0,2,3,4,5,20,31
sendmail 283 177 177 177 -1,-1 noflags -1,-1 1,205100 12,560853 lockf 0
0 0,0
vmdaemon 3 0 0 0 -1,-1 noflags 843891950,18048 0,0 15,822656 psleep 0 0
0,0
httpd 308 146 146 146 -1,-1 noflags -1,-1 0,218772 726,982402 accept 0 0
0,0,0,2,3,4,5,20,31
sendmail 309 177 177 177 -1,-1 noflags -1,-1 1,166202 19,872092 lockf 0
0 0,0
sendmail 325 177 177 177 -1,-1 noflags -1,-1 1,229310 14,221587 lockf 0
0 0,0
httpd 326 146 146 146 -1,-1 noflags -1,-1 0,217683 22,817913 accept 0 0
0,0,0,2,3,4,5,20,31
sendmail 340 177 177 177 -1,-1 noflags -1,-1 1,175623 20,188029 lockf 0
0 0,0
sendmail 354 177 177 177 -1,-1 noflags -1,-1 0,868756 21,147577 lockf 0
0 0,0
sendmail 356 177 177 177 -1,-1 noflags -1,-1 1,29939 358,184882 lockf 0
0 0,0
sendmail 377 177 177 177 -1,-1 noflags -1,-1 0,748467 15,468319 lockf 0
0 0,0
sendmail 384 177 177 177 -1,-1 noflags -1,-1 0,820566 10,620912 lockf 0
0 0,0
sendmail 394 177 177 177 -1,-1 noflags -1,-1 0,779351 5,403506 lockf 0 0
0,0
update 4 0 0 0 -1,-1 noflags 843891950,18141 0,0 4,136158 update 0 0 0,0
paneld 40 1 40 40 -1,-1 sldr 843891978,367099 4,461534 7,731289 select 0
0 0,0
sendmail 400 177 177 177 -1,-1 noflags -1,-1 0,568258 4,269417 lockf 0 0
0,0
sendmail 407 177 177 177 -1,-1 noflags -1,-1 0,636948 4,190048 lockf 0 0
0,0
sendmail 411 177 177 177 -1,-1 noflags -1,-1 0,573944 3,427725 lockf 0 0
0,0
restartd 42 1 42 42 -1,-1 sldr 843891978,587161 0,344955 2,53304 select
0 0 0,0
sendmail 421 177 177 177 -1,-1 noflags -1,-1 0,546195 3,255322 lockf 0 0
0,0
sendmail 428 177 177 177 -1,-1 noflags -1,-1 0,560586 3,148498 lockf 0 0
0,0
sendmail 431 177 177 177 -1,-1 noflags -1,-1 0,496855 2,657426 lockf 0 0
0,0
sendmail 438 177 177 177 -1,-1 noflags -1,-1 0,429439 2,699336 lockf 0 0
0,0
sendmail 440 177 177 177 -1,-1 noflags -1,-1 0,468329 2,802541 lockf 0 0
0,0
sendmail 445 177 177 177 -1,-1 noflags -1,-1 0,374571 2,255750 lockf 0 0
0,0
sendmail 447 177 177 177 -1,-1 noflags -1,-1 0,369364 2,336634 lockf 0 0
0,0
sendmail 453 177 177 177 -1,-1 noflags -1,-1 0,310400 2,10209 lockf 0 0
0,0
telnetd 458 89 458 458 -1,-1 sldr 843897550,663780 0,190723 0,440131
select 0 0 0,0
tcsh 459 458 459 459 5,0 ctty,sldr 843897552,813667 0,628812 0,745800
pause 0 0 0,0,0,2,3,4,5,20,31
sendmail 464 177 177 177 -1,-1 noflags 843897574,892285 0,402615
1,742086 lockf 0 0 0,0
cron 468 77 77 77 -1,-1 noflags 843897601,288027 0,0 0,40903 piperd 0 0
0,0
cat 469 459 469 459 5,0 ctty 843897601,455747 0,15564 0,420248 nochan 0
0 0,0,0,2,3,4,5,20,31
sh 470 468 470 470 -1,-1 sldr 843897601,717028 0,8824 0,114718 wait 0 0
0,0,0,2,3,4,5,20,31
syslogd 75 1 75 75 -1,-1 sldr 843891986,30776 2,322467 19,973221 select
0 0 0,0
cron 77 1 77 77 -1,-1 sldr 843891986,826365 0,364523 27,618492 pause 0 0
0,0
inetd 89 1 89 89 -1,-1 sldr -1,-1 0,50930 0,263141 select 0 0 0,0
pwcheck 96 1 5 0 12,0 noflags -1,-1 0,44474 0,74123 accept 0 0 0,0
cat 469 459 469 459 5,0 ctty 843897601,455747 0,15423 0,431849 nochan 0
0 0,0,0,2,3,4,5,20,31


notice all the sendmails in 'lockf' wait.
I can suspend the system at this moment and wander around 
using gdb.
I can even get anyone else
into the system over the net
and give them a gdb session to poke around in this system.
I'm kinda not sure what to look at to try to work out
what the problem is..

right now I removed the external load, so the system is responsive, but
the sendmails are all still locked..
<CTL><ALT><ESC> drops it into the remote-gdb debugger
if I need to look at something....
any suggestions?

julian



Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?324CDD35.167EB0E7>