Date:      Tue, 2 Oct 2012 13:26:37 +0300
From:      Nikolay Denev <ndenev@gmail.com>
To:        "<freebsd-fs@freebsd.org>" <freebsd-fs@FreeBSD.ORG>
Subject:   nfs + zfs hangs on RELENG_9
Message-ID:  <906543F2-96BD-4519-B693-FD5AFB646F87@gmail.com>

Hi,

I'm experimenting on a machine with both CTL and NFS exports to a Linux host that runs an Oracle DB instance.
Now, for the second time, I'm seeing a ZFS I/O hang after some activity over the NFS mount.
All I/O to the pool containing the NFS share hangs. For example, I ran:

root@goliath:/home/ndenev # du -ach /tank

And it just hangs there. procstat -kk output of the du command:

root@goliath:/home/ndenev # procstat -kk 38266
  PID    TID COMM             TDNAME           KSTACK
38266 101997 du               -                mi_switch+0x186 sleepq_wait+0x42 _cv_wait+0x121 txg_wait_open+0x85 dmu_tx_assign+0x170 zfs_inactive+0xf1 zfs_freebsd_inactive+0x1a vinactive+0x8d vputx+0x2d8 sys_fchdir+0x35a amd64_syscall+0x546 Xfast_syscall+0xf7
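
(A quick way to see whether anything else on the system is parked in the same place would be something like the following; the output file path is just an example:)

# dump the kernel stack of every thread, then look for anything else
# waiting for the next transaction group to open
procstat -kk -a > /var/tmp/procstat.all.txt
grep txg_wait_open /var/tmp/procstat.all.txt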

The nfsd server process is also in D state:

root     1420    0.0  0.0   9912  584 ??  Ss   Thu08PM     0:00.04 nfsd: master (nfsd)
root     1422    0.0  0.0   9912 2332 ??  D    Thu08PM  5347:34.58 nfsd: server (nfsd)

root@goliath:/home/ndenev # procstat -kk 1420
  PID    TID COMM             TDNAME           KSTACK
 1420 100740 nfsd             -                mi_switch+0x186 sleepq_catch_signals+0x2cc sleepq_wait_sig+0xc _cv_wait_sig+0x12e seltdwait+0x110 kern_select+0x6ed sys_select+0x5d amd64_syscall+0x546 Xfast_syscall+0xf7

root@goliath:/home/ndenev # procstat -kk 1422
  PID    TID COMM             TDNAME           KSTACK
 1422 101178 nfsd             nfsd: master     mi_switch+0x186 sleepq_wait+0x42 _sx_xlock_hard+0x428 _sx_xlock+0x51 arc_buf_evict+0x84 dbuf_clear+0x91 dbuf_free_range+0x469 dnode_free_range+0x2c4 dmu_free_long_range_impl+0x13d dmu_free_long_range+0x4c zfs_rmnode+0x69 zfs_inactive+0x66 zfs_freebsd_inactive+0x1a vinactive+0x8d vputx+0x2d8 zfs_freebsd_rename+0xe1 VOP_RENAME_APV+0x46 nfsvno_rename+0x2cc

[… showing only the master thread, as I'm running with maxthreads 196 and the output is too verbose; I can provide it if needed …]
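
(Rather than pasting all of those stacks, a rough count of how many of the nfsd service threads are stuck at the same point can be had with something like:)

# how many nfsd threads are waiting for a txg vs. blocked on the sx lock
procstat -kk 1422 | grep -c txg_wait_open
procstat -kk 1422 | grep -c _sx_xlock_hard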

There are also these two kernel threads that appear to be waiting on a lock:

    0 100423 kernel           zio_write_intr_6 mi_switch+0x186 sleepq_wait+0x42 _sx_xlock_hard+0x428 _sx_xlock+0x51 buf_hash_insert+0x56 arc_write_done+0x8e zio_done+0x2ee zio_execute+0xc3 zio_done+0x367 zio_execute+0xc3 zio_done+0x367 zio_execute+0xc3 zio_done+0x367 zio_execute+0xc3 zio_done+0x367 zio_execute+0xc3 zio_done+0x367 zio_execute+0xc3
    0 101699 kernel           zio_write_intr_7 mi_switch+0x186 sleepq_wait+0x42 _sx_xlock_hard+0x428 _sx_xlock+0x51 buf_hash_insert+0x56 arc_write_done+0x8e zio_done+0x2ee zio_execute+0xc3 zio_done+0x367 zio_execute+0xc3 zio_done+0x367 zio_execute+0xc3 taskqueue_run_locked+0x85 taskqueue_thread_loop+0x46 fork_exit+0x11f fork_trampoline+0xe

And these threads are from the zfskern kernel process:

root@goliath:/home/ndenev # procstat -kk 7
  PID    TID COMM             TDNAME           KSTACK
    7 100192 zfskern          arc_reclaim_thre mi_switch+0x186 sleepq_wait+0x42 _sx_xlock_hard+0x428 _sx_xlock+0x51 arc_buf_remove_ref+0x8a dbuf_rele_and_unlock+0x132 dbuf_evict+0x11 dbuf_do_evict+0x53 arc_do_user_evicts+0xb4 arc_reclaim_thread+0x263 fork_exit+0x11f fork_trampoline+0xe
    7 100193 zfskern          l2arc_feed_threa mi_switch+0x186 sleepq_timedwait+0x42 _cv_timedwait+0x13c l2arc_feed_thread+0x1a3 fork_exit+0x11f fork_trampoline+0xe
    7 100536 zfskern          txg_thread_enter mi_switch+0x186 sleepq_wait+0x42 _cv_wait+0x121 txg_thread_wait+0x79 txg_quiesce_thread+0xb5 fork_exit+0x11f fork_trampoline+0xe
    7 100537 zfskern          txg_thread_enter mi_switch+0x186 sleepq_wait+0x42 _cv_wait+0x121 zio_wait+0x61 dsl_pool_sync+0xe0 spa_sync+0x336 txg_sync_thread+0x136 fork_exit+0x11f fork_trampoline+0xe
    7 101811 zfskern          txg_thread_enter mi_switch+0x186 sleepq_wait+0x42 _cv_wait+0x121 txg_quiesce_thread+0x1fb fork_exit+0x11f fork_trampoline+0xe
    7 101812 zfskern          txg_thread_enter mi_switch+0x186 sleepq_wait+0x42 _cv_wait+0x121 zio_wait+0x61 dsl_pool_sync+0x2c3 spa_sync+0x336 txg_sync_thread+0x136 fork_exit+0x11f fork_trampoline+0xe
    7 101813 zfskern          zvol tank/oracle mi_switch+0x186 sleepq_wait+0x42 _cv_wait+0x121 txg_wait_open+0x85 dmu_tx_assign+0x170 zvol_strategy+0x27a zvol_geom_worker+0xf9 fork_exit+0x11f fork_trampoline+0xe
    7 101817 zfskern          zvol tank/oracle mi_switch+0x186 sleepq_wait+0x42 _sleep+0x390 zvol_geom_worker+0xec fork_exit+0x11f fork_trampoline+0xe
    7 101822 zfskern          zvol tank/oracle mi_switch+0x186 sleepq_wait+0x42 _sleep+0x390 zvol_geom_worker+0xec fork_exit+0x11f fork_trampoline+0xe
    7 101824 zfskern          zvol tank/oracle mi_switch+0x186 sleepq_wait+0x42 _cv_wait+0x121 txg_wait_open+0x85 dmu_tx_assign+0x170 zvol_strategy+0x27a zvol_geom_worker+0xf9 fork_exit+0x11f fork_trampoline+0xe
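
(I haven't yet confirmed whether the disks are actually idle while the txg_sync thread sits in zio_wait; something along these lines should show whether the pool is still doing physical I/O, plus the ARC hash/lock contention counters. The egrep pattern is just an example:)

# is the pool still doing physical I/O?
zpool iostat -v tank 1
gstat -a
# ARC hash table / lock contention counters
sysctl kstat.zfs.misc.arcstats | egrep 'mutex_miss|hash'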


I'm running with the following NFS settings:

nfs_server_enable="YES"
nfsv4_server_enable="YES"
nfsuserd_enable="YES"
rpc_statd_enable="YES"
rpc_lockd_enable="YES"

vfs.nfsd.minthreads=32
vfs.nfsd.maxthreads=196
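
(Those last two are sysctls; to double-check that the running server actually picked them up, and to get some server-side counters, something like this should do, assuming I have the nfsstat flags right:)

# confirm the live values and dump server-side NFS statistics
sysctl vfs.nfsd.minthreads vfs.nfsd.maxthreads
nfsstat -e -s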

root@goliath:/home/ndenev # cat /etc/exports
V4: /tank -sec=sys -network 10.0.0.0/8
/tank/oracle_db2	-maproot=root -alldirs -network 10.XX.XX.XX -mask 255.255.255.255
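
(And to double-check what mountd actually parsed out of /etc/exports:)

# NFSv3 export list as mountd sees it (the V4: root won't show up here)
showmount -e localhost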

And the mount on the Linux machine is:

goliath.neterra.moneybookers.net:/tank/oracle_db2 /mnt/goliath2 nfs	rw,bg,hard,nointr,tcp,actimeo=0,vers=3,timeo=600,rsize=32768,wsize=32768 0 0
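
(On the Linux side, the options that were actually negotiated can be confirmed with something like:)

# show the effective NFS mount options as Linux sees them
nfsstat -m
# or
grep goliath /proc/mounts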

Any ideas on how this I/O hang can be debugged further? I can provide more information (or less :) ) if needed.
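
(If it would help, next time it wedges I could also try to grab the lock state from DDB, assuming the kernel was built with DDB, and with WITNESS for the lock list, which I'd have to verify first:)

sysctl debug.kdb.enter=1    # drop to the kernel debugger on the console
# then in DDB:
#   show allchains          # who is blocked waiting on whom
#   show alllocks           # held locks (needs WITNESS)
#   ps                      # process/thread states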




