Skip site navigation (1)Skip section navigation (2)
Date:      Mon, 17 Dec 2001 13:06:04 +0100 (CET)
From:      Martin Blapp <mb@imp.ch>
To:        <freebsd-hackers@freebsd.org>
Subject:   deadlock with softupdates ?
Message-ID:  <20011217125752.P43207-100000@levais.imp.ch>

next in thread | raw e-mail | index | archive | help

We have a newserver running here, which freezes
every 2-3 days. Break into db is still possible.

It is a STABLE 4.4 machine, with new kernel.

NEWS # mount
/dev/da0s1a on / (ufs, local, soft-updates)
/dev/da0s1e on /var (ufs, local, soft-updates)
/dev/da1s1e on /news (ufs, local, soft-updates)
/dev/da2s1e on /news/spool/overview (ufs, local, soft-updates)
procfs on /proc (procfs, local)
/dev/twed0s1e on /news/spool/articles/1 (ufs, local, soft-updates)
/dev/twed1s1e on /news/spool/articles/2 (ufs, local, soft-updates)

The latter two filesystems have 10 big files with the cyclic news
filesystem on them. So we have a filesystem on a filesystem.

I guess this is what makes the problems.

I've got a kernel dump of a hanged session, but it
is not very informative.

bash-2.05a# ps -M /var/crash/vmcore.0 -N
/usr/src/sys/compile/NEWS/kernel.deb-axl | more
  UID   PID  PPID CPU PRI NI   VSZ  RSS WCHAN  STAT  TT       TIME COMMAND
    8   289     1 100  10  0   668    0 wait   IW   #C5-   0:00.00  (sh)
    8   290     1 100  10  0   668    0 wait   IW   #C5-   0:00.00  (sh)
    8   292   289   0 -18  0   680    0 vmwait DL   #C5-   0:30.78  (sh)
    8   293   290   0 -14  0  3644    0 inode  D    #C5-   0:49.73  (perl)
    0 12263     1   0  10  0  1200    0 wait   IWs  #C5    0:00.00
(login)
    0 12295 12263   0  18  0  1324    0 pause  IW   #C5    0:00.00  (csh)
    0 12297 12295   0  10  0  1060    0 wait   IW   #C5    0:00.00  (bash)
    0 12300 12297   0  -5  0  1532    0 sysctl D+   #C5    6:58.09
(systat)
 1042   677   671   0  18  0   580    0 pause  IWs  #C5    0:00.00  (ksh)
    0   689   677   0  18  0  1336    0 pause  IW   #C5    0:00.00  (csh)
    0   696   689   0  18  0   588    0 pause  I    #C5    0:00.03  (ksh)
    8 13284   696  12  10  0   632    0 wait   I    #C5    0:00.01  (sh)
    8 13742 13284  12  10  0   636    0 wait   I+   #C5    0:00.01  (sh)
    8 13750 13742   0 -18  0 172960    0 vmwait DL+  #C5    0:01.82
(suck)
    0   317     1   0   3  0   948    0 ttyin  IWs+ #C1    0:00.00
(getty)
    0   316     1   0   3  0   948    0 ttyin  IWs+ #C1    0:00.00
(getty)
    0   315     1   0   3  0   948    0 ttyin  IWs+ #C1    0:00.00
(getty)
    0   314     1   0   3  0   948    0 ttyin  IWs+ #C1    0:00.00
(getty)
    0   313     1   0   3  0   948    0 ttyin  IWs+ #C1    0:00.00
(getty)
    0   312     1   0   3  0   948    0 ttyin  IWs+ #C1    0:00.00
(getty)
    0   319     1   0   3  0   948    0 ttyin  IWs+ #C1    0:00.00
(getty)
    0   318     1   0   3  0   948    0 ttyin  IWs+ #C1    0:00.00
(getty)
    0 12307 12306   0  10  0  1064    0 wait   IWs  #C9    0:00.00  (bash)
    0 12312 12307   0  10  0  1064    0 wait   IW   #C9    0:00.00  (bash)
    0 12313 12312   0 -18  0  2012    0 vmwait D+   #C9    8:22.42  (top)
    0 59816 59812   0   3  0  1064    0 ttyin  Is+  #C6    0:00.04  (bash)
 1042 35037 35035   2  18  0   580    0 pause  IWs  #C6    0:00.00  (ksh)
    0 35050 35037   0  18  0  1336    0 pause  IW   #C6    0:00.00  (csh)
    0 35059 35050   0   3  0   600    0 ttyin  I+   #C6    0:00.17  (ksh)
    0     0     0   0 -18  0     0    0 sched  DLs   ??    0:00.00
(swapper)
    0     1     0   0  10  0   544    0 wait   ILs   ??    0:00.01  (init)
    0     2     0   0 -14  0     0    0 inode  DL    ??    0:36.22
(pagedaemon
    0     3     0  38  18  0     0    0 psleep DL    ??    0:00.61
(vmdaemon)
    0     4     0   0 -18  0     0    0 psleep DL    ??    0:43.03
(bufdaemon)
    0     5     0   0  18  0     0    0 syncer DL    ??   10:42.21
(syncer)
    0   137     1   0 -18  0  2404    0 vmwait DLs   ??    0:54.90
(gated)
    0   149     1   0   2  0   960    0 select Ss    ??    3:42.56
(syslogd)
    0   158     1   0   2 -12  1272    0 select S<s   ??    0:11.33
(ntpd)
    0   177     1   0   2  0  1056    0 select Is    ??    0:00.00
(inetd)
    0   179     1   0 -18  0   984    0 vmwait DLs   ??    0:00.77  (cron)
    0   182     1   9   2  0  2216    0 select Is    ??    0:11.13  (sshd)
    0   184     1   0   2  0   908    0 select Ss    ??    0:00.36  (usbd)
    0   221     1   0   2  0  2084    0 select Ss    ??    0:07.75
(httpd)
    0   229     1   0   2  0   932    0 select Ss    ??    3:55.50
(healthd)
65534   238   221   0   2  0  2148    0 accept I     ??    0:00.13
(httpd)
65534   239   221   0   2  0  2148    0 accept I     ??    0:00.12
(httpd)
65534   240   221   0   2  0  2148    0 accept I     ??    0:00.12
(httpd)
65534   241   221   0   2  0  2148    0 accept I     ??    0:00.12
(httpd)
65534   242   221   0   2  0  2148    0 accept I     ??    0:00.12
(httpd)
    8   288     1   0 -14  0 133420    0 inode  DLs   ??   28:25.45
(innd)
    0   307     1  34   2  0  1580    0 select IWs   ??    0:00.00
(radiusd)
    0   671   182   0   2  0  2320    0 select I     ??    0:00.85  (sshd)
    8 10135   288   0  36  4     0    0 -      ZN    ??    0:00.00
(nnrpd)
65534 10363   221   0   2  0  2152    0 accept I     ??    0:00.12
(httpd)
65534 10364   221   0   2  0  2152    0 accept I     ??    0:00.12
(httpd)
65534 10366   221   0   2  0  2152    0 accept I     ??    0:00.12
(httpd)
65534 10367   221   0   2  0  2152    0 accept I     ??    0:00.12
(httpd)
65534 10368   221   0   2  0  2152    0 accept I     ??    0:00.12
(httpd)
    8 11811   288   0   2  4 48124    0 select IN    ??    0:01.14
(nnrpd)
    0 12306   182   0 -18  0  2320    0 vmwait DL    ??    0:29.42  (sshd)
    8 13265   288   0  36  4     0    0 -      ZN    ??    0:00.00
(nnrpd)
    8 13333   288   0 -18  4 46448    0 vmwait DNL   ??    0:00.03
(nnrpd)
    8 13338   288   0 -18  4 46448    0 vmwait DNL   ??    0:00.04
(nnrpd)
8 11811   288   0   2  4 48124    0 select IN    ??    0:01.14  (nnrpd)
    0 12306   182   0 -18  0  2320    0 vmwait DL    ??    0:29.42  (sshd)
    8 13265   288   0  36  4     0    0 -      ZN    ??    0:00.00
(nnrpd)
    8 13333   288   0 -18  4 46448    0 vmwait DNL   ??    0:00.03
(nnrpd)
    8 13338   288   0 -18  4 46448    0 vmwait DNL   ??    0:00.04
(nnrpd)
    8 13390   288   0 -18  4 46468    0 vmwait DNL   ??    0:00.02
(nnrpd)
    8 13447   288   0  36  4     0    0 -      ZN    ??    0:00.00
(nnrpd)
    8 13764   288   0  36  4     0    0 -      ZN    ??    0:00.00
(nnrpd)
    8 13767   288   0 -18  4  2052    0 vmwait DNL   ??    0:00.00
(nnrpd)
    0 13768   179   0  10  0   984    0 ppwait D     ??    0:00.00  (cron)
    0 13769 13768   0 -18  0   984    0 vmwait DVLs  ??    0:00.00  (cron)
    8 13770   288   0 -18  4  2052    0 vmwait DNL   ??    0:00.00
(nnrpd)
    8 13771   288   0 -14  4  2072    0 inode  DN    ??    0:00.01
(nnrpd)
    8 13772   288   0 -18  4  2128    0 vmwait DNL   ??    0:00.01
(nnrpd)
    8 13773   288   0 -18  4  2052    0 vmwait DNL   ??    0:00.00
(nnrpd)
    8 13774   288   0 -18  4  2108    0 vmwait DNL   ??    0:00.01
(nnrpd)
    8 13775   288   0 -14  4  2128    0 inode  DN    ??    0:00.01
(nnrpd)
    8 13776   288   0 -18  4  2080    0 vmwait DNL   ??    0:00.00
(nnrpd)
    8 13777   288   0 -18  4   584    0 vmwait DNL   ??    0:00.00
(nnrpd)
    8 13778   288   0 -18  4   584    0 vmwait DNL   ??    0:00.00
(nnrpd)
    8 13779   288   0 -14  4  2120    0 inode  DN    ??    0:00.01
(nnrpd)
    8 13780   288   0 -18  4  1916    0 vmwait DNL   ??    0:00.00
(nnrpd)
    8 13781   288   0 -14  4 45968    0 inode  DN    ??    0:00.01
(nnrpd)
    0 13782   179   0  28  0   680    0 -      R     ??    0:00.00  (cron)
    8 13783   292   0 -18  0   680    0 vmwait DL    ??    0:00.00  (sh)
    0 35035   182   0   2  0  2320    0 select I     ??    0:01.03  (sshd)
    0 59812   182   0 -18  0  2320    0 vmwait DL    ??    0:00.10  (sshd)
    8 70957   288   0   2  4  1632    0 sbwait IN    ??    0:06.20
(ninpaths)
    0 75942     1   0   2  0  1564    0 select Is    ??    0:00.02
(nsrexecd)
    0 75943 75942   0   2  0  1556    0 select S     ??    0:00.15
(nsrexecd)
    8 76594   288   0   2  4 45876    0 select IN    ??    4:15.73
(innfeed)
    8 76595   288   0   2  4  3144    0 sbwait IN    ??    2:16.47
(overchan)

Anyway, I noted that many processes are stuck at the same place in
ffs_read:

(kgdb) defproc 13769
13769 d78cd0c0 d78bb000    0 13768 13769  800014 3  cron         vmwait
c031f718
 frame 0 at 0xd78bdc08: ebp d78bdc34, eip 0xc017a529 <tsleep+417>:
mov    0x141(%ebx),%al
 frame 1 at 0xd78bdc34: ebp d78bdc50, eip 0xc0238897 <vm_await+35>:
add    $0x10,%esp
 frame 2 at 0xd78bdc50: ebp d78bdd00, eip 0xc0230574 <vm_fault+192>:
add    $0x8,%esp
 frame 3 at 0xd78bdd00: ebp d78bdd28, eip 0xc0284b9e <trap_pfault+118>:
mov    %eax,%ecx
 frame 4 at 0xd78bdd28: ebp d78bdd6c, eip 0xc02847af <trap+767>:
jmp    0xc0284afe <trap+1614>
 frame 5 at 0xd78bdd6c: ebp d78bdde4, eip 0xc0276730 <alltraps>:
push   %ebx
 frame 6 at 0xd78bdde4: ebp d78bde48, eip 0xc0226dbe <ffs_read+842>:
mov    %eax,0xffffffd4(%ebp)
 frame 7 at 0xd78bde48: ebp d78bde7c, eip 0xc01aba18 <vn_read+300>:
mov    %eax,0xffffffe8(%ebp)
 frame 8 at 0xd78bde7c: ebp d78bdef8, eip 0xc0185cc0 <dofileread+196>:
mov    %eax,%esi
 frame 9 at 0xd78bdef8: ebp d78bdf2c, eip 0xc0185b86 <read+74>: mov
%eax,%esi
 frame 10 at 0xd78bdf2c: ebp d78bdfa0, eip 0xc028520d <syscall2+465>:
mov    %eax,0xffffffb8(%ebp)

(kgdb) defproc 13750
13750 d78cd0c0 d7b9b000    8 13742 13742  804006 3  suck         vmwait
c031f718
 frame 0 at 0xd7b9dc08: ebp d7b9dc34, eip 0xc017a529 <tsleep+417>:
mov    0x141(%ebx),%al
 frame 1 at 0xd7b9dc34: ebp d7b9dc50, eip 0xc0238897 <vm_await+35>:
add    $0x10,%esp
 frame 2 at 0xd7b9dc50: ebp d7b9dd00, eip 0xc0230574 <vm_fault+192>:
add    $0x8,%esp
 frame 3 at 0xd7b9dd00: ebp d7b9dd28, eip 0xc0284b9e <trap_pfault+118>:
mov    %eax,%ecx
 frame 4 at 0xd7b9dd28: ebp d7b9dd6c, eip 0xc02847af <trap+767>:
jmp    0xc0284afe <trap+1614>
 frame 5 at 0xd7b9dd6c: ebp d7b9dde4, eip 0xc0276730 <alltraps>:
push   %ebx
 frame 6 at 0xd7b9dde4: ebp d7b9de48, eip 0xc0226dbe <ffs_read+842>:
mov    %eax,0xffffffd4(%ebp)
 frame 7 at 0xd7b9de48: ebp d7b9de7c, eip 0xc01aba18 <vn_read+300>:
mov    %eax,0xffffffe8(%ebp)
 frame 8 at 0xd7b9de7c: ebp d7b9def8, eip 0xc0185cc0 <dofileread+196>:
mov    %eax,%esi
 frame 9 at 0xd7b9def8: ebp d7b9df2c, eip 0xc0185b86 <read+74>: mov
%eax,%esi
 frame 10 at 0xd7b9df2c: ebp d7b9dfa0, eip 0xc028520d <syscall2+465>:
mov    %eax,0xffffffb8(%ebp)

Do you have a clue how I can debug this problem ?

Martin

Martin Blapp, mb@imp.ch
------------------------------------------------------------------
Improware AG, UNIX solution and service provider
Zurlindenstrasse 29, 4133 Pratteln, Switzerland
Phone: +41 061 826 93 00: +41 61 826 93 01
PGP Fingerprint: 57E 7CCD 2769 E7AC C5FA  DF2C 19C6 DCD1 1B3A EC9C
------------------------------------------------------------------


To Unsubscribe: send mail to majordomo@FreeBSD.org
with "unsubscribe freebsd-hackers" in the body of the message




Want to link to this message? Use this URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?20011217125752.P43207-100000>